diff --git a/.gitignore b/.gitignore index 595187a..a7b16eb 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ artifacts/*.db logs/*.log reports/*.pdf reports/*.html +reports/*.docx # Keep .gitkeep files !.gitkeep diff --git a/CITATION.cff b/CITATION.cff index 58309b3..2e72c02 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,8 +1,8 @@ cff-version: 1.2.0 message: "If you use this software, please cite it as below." title: "NHANES Periodontitis Prediction Benchmark" -version: 1.3.1 -date-released: 2026-06-02 +version: 1.3.2 +date-released: 2026-06-04 url: "https://github.com/Tuminha/NHANES-Periodontitis-Machine-Learning-Project" repository-code: "https://github.com/Tuminha/NHANES-Periodontitis-Machine-Learning-Project" license: MIT @@ -11,7 +11,13 @@ authors: - family-names: "Barbosa" given-names: "Francisco Teixeira" email: "cisco@periospot.com" - affiliation: "Oral Rehabilitation Foundation" + affiliation: "Foundation for Oral Rehabilitation; DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University" + - family-names: "Brizuela-Velasco" + given-names: "Aritza" + affiliation: "DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University" + - family-names: "Robles Cantero" + given-names: "Daniel" + affiliation: "DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University" keywords: - periodontitis - machine learning @@ -33,5 +39,9 @@ preferred-citation: - family-names: "Barbosa" given-names: "Francisco Teixeira" email: "cisco@periospot.com" + - family-names: "Brizuela-Velasco" + given-names: "Aritza" + - family-names: "Robles Cantero" + given-names: "Daniel" title: "Machine Learning for Periodontitis Prediction: A Realistic Benchmark with Same-Source Temporal Validation on NHANES" year: 2026 diff --git a/MODEL_CARD.md b/MODEL_CARD.md index ce6fd14..60731ad 100644 --- a/MODEL_CARD.md +++ b/MODEL_CARD.md @@ -78,4 +78,4 @@ The consistency check enforces agreement 
between result artifacts, README, this ## AI-Use Disclosure -AI systems were used as drafting and code-review aids during project development. The author remains responsible for study design, code, analysis decisions, interpretation, and manuscript claims. +AI systems were used as drafting, figure-label review, critique, and code-review aids during project development. No AI system is listed as an author. The authors remain responsible for study design, code, analysis decisions, interpretation, and manuscript claims. diff --git a/Makefile b/Makefile index 2941f76..7667439 100644 --- a/Makefile +++ b/Makefile @@ -111,7 +111,7 @@ notebook: figures: @echo "Generating publication figures..." - @echo "Figure regeneration remains notebook-backed; see notebooks/00_nhanes_periodontitis_end_to_end.ipynb" + $(PYTHON) scripts/06_generate_publication_figures.py manuscript: @echo "Rendering manuscript if pandoc is installed..." @@ -128,6 +128,9 @@ manuscript: exit 1; \ fi; \ mkdir -p reports; \ + pandoc docs/publication/ARTICLE_DRAFT.md \ + -o reports/manuscript_bmc_submission.docx; \ + echo "Rendered reports/manuscript_bmc_submission.docx"; \ pandoc docs/publication/ARTICLE_DRAFT.md \ --pdf-engine=$$ENGINE \ -V geometry:margin=1in \ @@ -145,7 +148,7 @@ clean: rm -rf artifacts/*.npy rm -rf artifacts/*.db rm -rf logs/*.log - rm -rf reports/*.pdf reports/*.html + rm -rf reports/*.pdf reports/*.html reports/*.docx @echo "Clean complete" lock: diff --git a/README.md b/README.md index 3d58d89..979d4db 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,10 @@ The legacy notebooks are retired as source-of-truth artifacts. 
The maintained pu | `scripts/reproduce_v13_primary.py` | Regenerates internal v1.3 benchmark result artifacts | | `scripts/run_temporal_validation.py` | Regenerates same-source temporal validation artifacts | | `scripts/04_publication_analyses.py` | Generates publication sensitivity tables from processed predictions | +| `scripts/06_generate_publication_figures.py` | Generates submission figures from canonical result artifacts | | `results/publication_sensitivity_tables.md` | Survey-weighted prevalence and subgroup performance summary generated by the full reproduction | +| `figures/19_publication_performance_summary.png` | Main performance and operating-point figure | +| `figures/20_publication_sensitivity_summary.png` | Survey-weighted prevalence, subgroup AUC, and missingness figure | | `results/` | Saved result artifacts used by the manuscript and model card | | `docs/publication/ARTICLE_DRAFT.md` | Current manuscript source | @@ -90,7 +93,7 @@ The legacy notebooks are retired as source-of-truth artifacts. 
The maintained pu ```bibtex @software{barbosa_nhanes_periodontitis_benchmark, - author = {Barbosa, Francisco Teixeira}, + author = {Barbosa, Francisco Teixeira and Brizuela-Velasco, Aritza and Robles Cantero, Daniel}, title = {NHANES Periodontitis Prediction Benchmark}, year = {2026}, url = {https://github.com/Tuminha/NHANES-Periodontitis-Machine-Learning-Project} diff --git a/docs/publication/ARTICLE_DRAFT.md b/docs/publication/ARTICLE_DRAFT.md index 176f789..fd3c5ae 100644 --- a/docs/publication/ARTICLE_DRAFT.md +++ b/docs/publication/ARTICLE_DRAFT.md @@ -1,8 +1,18 @@ # Machine Learning for Periodontitis Prediction: A Realistic Benchmark with Same-Source Temporal Validation on NHANES -**Draft version:** publication-readiness repair, June 2026 -**Article type:** prediction-model methodology benchmark -**Reporting target:** TRIPOD+AI-aligned development and validation report +**Authors:** Francisco Teixeira Barbosa^1,2\*^, Aritza Brizuela-Velasco^2^, Daniel Robles Cantero^2^ + +**Affiliations:** + +1. Foundation for Oral Rehabilitation (FOR), Werftestrasse 4, 6002 Luzern, Switzerland. +2. DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University (UEMC), Padre Julio Chevalier 2, 47012 Valladolid, Spain. + +**Corresponding author:** Francisco Teixeira Barbosa, cisco@periospot.com + +- **Draft version:** BMC Oral Health submission preparation, June 2026 +- **Target journal:** BMC Oral Health +- **Article type:** Research article +- **Reporting target:** TRIPOD+AI-aligned development and validation report ## Abstract @@ -14,9 +24,11 @@ **Conclusions:** Low-cost NHANES predictors appear to support realistic discrimination around 0.69 internally and 0.65 under same-source temporal validation. The study is best interpreted as a benchmark and cautionary methods report, not as evidence that a deployable diagnostic screening model has been established. 
Geographic validation, prospective clinical validation, local recalibration, and subgroup calibration remain necessary before implementation claims. +**Trial registration:** Not applicable. + **Keywords:** periodontitis; NHANES; prediction model; gradient boosting; calibration; missing data; TRIPOD+AI -## 1. Introduction +## Background Periodontitis is common among adults and is often identified only after irreversible tissue destruction has occurred. Low-cost risk stratification is attractive because full periodontal examination requires trained personnel, examination time, and access to dental care. However, a model built from demographic, behavioral, anthropometric, and metabolic predictors is not equivalent to clinical examination. @@ -30,27 +42,27 @@ The objectives were to: 4. Assess missingness indicators and a deployment-ready feature set without NHANES-specific missingness flags. 5. Reframe the model using prediction-model reporting standards and explicit limitations. -## 2. Methods +## Methods -### 2.1 Study Design and Data Source +### Study Design and Data Source This was a prediction-model benchmark using public NHANES data. The development cohort comprised NHANES 2011-2012 and 2013-2014. The same-source temporal validation cohort comprised NHANES 2009-2010. Later NHANES cycles were not used for temporal validation because full-mouth periodontal measurements required for CDC/AAP classification were discontinued. NHANES data are publicly available and de-identified. This secondary analysis did not require institutional review board approval. -### 2.2 Participants +### Participants Eligible participants were adults age 30 years or older with full periodontal examination data and sufficient information for CDC/AAP periodontitis classification. The development cohort included 9,034 participants. The same-source temporal validation cohort included 5,037 participants. 
The analytic prevalence of periodontitis was approximately 69-72% unweighted and approximately 66% after applying examination weights by cycle, higher than general-population CDC estimates. This reflects the restricted full-examination analytic sample and should not be interpreted as the expected prevalence in a lower-risk screening population. -### 2.3 Outcome Definition +### Outcome Definition The binary outcome was any periodontitis versus no periodontitis using CDC/AAP definitions. Severe, moderate, and mild classifications were assigned hierarchically from interproximal pocket depth and clinical attachment loss measurements, excluding third molars and enforcing different-teeth criteria where specified. The repository now includes synthetic tests for severe, moderate, mild, and no-periodontitis cases, third-molar exclusion, and different-teeth logic. -### 2.4 Predictors and Feature Sets +### Predictors and Feature Sets The primary model used 29 predictors after excluding treatment-seeking variables that can be downstream of disease: @@ -63,29 +75,29 @@ The secondary model used 33 predictors by restoring those variables. The seconda Predictor categories included demographics, smoking and alcohol variables, anthropometric measures, blood pressure, fasting glucose, triglycerides, HDL, and missingness indicators. -### 2.5 Missing Data +### Missing Data NHANES missingness is not purely random. Fasting laboratory variables are collected in subsamples, and questionnaire items may follow skip-pattern logic. Tree models were allowed to handle missing values natively, and missingness indicators were included in the primary model. A deployment-ready 15-feature model without missingness indicators was retained as a conservative lower-bound benchmark because NHANES-specific missingness may not transfer to clinical datasets. 
-### 2.6 Model Development and Calibration +### Model Development and Calibration Gradient boosting models were tuned using Optuna and evaluated with stratified 5-fold cross-validation. Monotonic constraints were applied to selected continuous variables where clinical priors were clear. Isotonic calibration was used for probability calibration in the cross-validation workflow. The final temporal evaluation used the frozen primary model and pre-specified thresholds from the development workflow. Thresholds were not re-optimized on the temporal validation cohort. -### 2.7 Statistical Analysis +### Statistical Analysis Discrimination was summarized with AUC-ROC and PR-AUC. Calibration was summarized with Brier score and reliability plots. Operating points were reported using sensitivity, specificity, PPV, and NPV. Decision-curve analysis was included as a descriptive utility analysis only. Survey-weighted prevalence and subgroup performance are generated by `scripts/04_publication_analyses.py` when processed prediction tables are available. The current regenerated summary is saved in `results/publication_sensitivity_tables.md`. -## 3. Results +## Results -### 3.1 Cohorts +### Cohorts The development cohort included 9,034 adults from NHANES 2011-2014. The temporal validation cohort included 5,037 adults from NHANES 2009-2010. Periodontitis prevalence was 70.9% in development data and 69.1% in temporal validation data before survey weighting. -### 3.2 Internal Model Performance +### Internal Model Performance | Model variant | Features | AUC-ROC | PR-AUC | Interpretation | |---|---:|---:|---:|---| @@ -95,7 +107,7 @@ The development cohort included 9,034 adults from NHANES 2011-2014. The temporal The full-feature model improved AUC by 0.0100 over the primary model. This small difference supports excluding treatment-seeking variables from the primary benchmark. 
-### 3.3 Same-Source Temporal Validation +### Same-Source Temporal Validation | Metric | Development estimate | Temporal validation estimate | |---|---:|---:| @@ -107,7 +119,7 @@ The full-feature model improved AUC by 0.0100 over the primary model. This small The drop from internal AUC 0.6896 to temporal AUC 0.6495 is consistent with a modest but meaningful generalization gap. -### 3.4 Operating Points +### Operating Points | Threshold | Sensitivity | Specificity | PPV | NPV | |---:|---:|---:|---:|---:| @@ -116,13 +128,21 @@ The drop from internal AUC 0.6896 to temporal AUC 0.6495 is consistent with a mo The 0.35 threshold prioritizes sensitivity but has very low specificity and an NPV of only 69.1% in this high-prevalence cohort. It should be described as a high-sensitivity triage operating point, not a reliable disease-exclusion rule. -### 3.5 Missingness and Survey-Design Sensitivity +Figure 1 summarizes the discrimination, calibration error, temporal operating points, and primary-versus-secondary feature-set comparison. + +![Figure 1. Model performance summary. Panel A shows internal and same-source temporal discrimination for the primary and secondary model variants. Panel B shows Brier score, where lower values indicate lower calibration error. Panel C shows sensitivity, specificity, PPV, and NPV for the frozen primary model in NHANES 2009-2010 at thresholds 0.35 and 0.65. Panel D shows that adding treatment-seeking variables increased the feature count from 29 to 33 and changed internal AUC by 0.0100.](figures/19_publication_performance_summary.png) + +### Missingness and Survey-Design Sensitivity Missingness indicators contributed limited but measurable predictive signal. Missingness patterns were broadly comparable between the development and temporal validation cohorts, but the signal may still be NHANES-specific. The deployment-ready 15-feature model remains the most conservative estimate for settings where NHANES missingness patterns are unavailable. 
Survey-weighted prevalence and subgroup-performance tables were generated from processed prediction data and are saved in `results/publication_sensitivity_tables.md`. Weighted prevalence was approximately 65.6% in 2009-2010 and 66.2-66.3% in 2011-2014. Subgroup analyses are descriptive and should not be overinterpreted as evidence of transportability. -## 4. Discussion +Figure 2 summarizes weighted prevalence, selected subgroup AUCs, and the highest missingness proportions among predictors. + +![Figure 2. Survey sensitivity summary. Panel A compares unweighted and survey-weighted periodontitis prevalence by NHANES cycle. Panel B shows temporal AUC-ROC across selected age, sex, smoking, and metabolic-risk subgroups, with the overall temporal AUC shown as a dashed reference line. Panel C shows the predictors with the highest missingness proportions, highlighting the fasting laboratory variables as the dominant source of missingness.](figures/20_publication_sensitivity_summary.png) + +## Discussion This repair changes the interpretation of the study. The main contribution is not a clinically ready model. The contribution is a reproducible benchmark showing that realistic performance for low-cost periodontitis predictors is far below highly optimistic internal estimates reported in some prior work. @@ -130,34 +150,60 @@ The same-source temporal validation result is important but limited. Because bot The operating points also require conservative interpretation. High sensitivity at threshold 0.35 comes at the cost of low specificity, and the NPV does not support reassuring individual patients without periodontal examination. -## 5. Limitations +## Limitations 1. The validation cohort is temporally distinct but not independent by geography, health system, or measurement program. 2. The analytic cohort is restricted to adults with full periodontal examination data and has high disease prevalence. 3. 
NHANES missingness indicators may encode survey logistics rather than transportable clinical information. -4. Survey-weighted and subgroup calibration tables must be regenerated before submission and interpreted carefully. +4. Survey-weighted and subgroup calibration tables are descriptive and should be interpreted carefully. 5. The model predicts current case status, not future incident periodontitis. 6. The manuscript does not establish patient benefit, treatment impact, or workflow safety. -## 6. Conclusions +## Conclusions The primary 29-feature model achieved AUC-ROC 0.6896 internally and 0.6495 under same-source temporal validation. A secondary 33-feature model provided only a small apparent gain, supporting exclusion of treatment-seeking variables from the preferred benchmark. These results support a cautious methodological conclusion: low-cost NHANES predictors provide moderate discrimination and are useful for benchmarking, but they do not establish a clinically ready periodontitis screening system. -## Data and Code Availability +## List of abbreviations + +AUC-ROC: area under the receiver operating characteristic curve; Brier: Brier score; CDC: Centers for Disease Control and Prevention; CI: confidence interval; FOR: Foundation for Oral Rehabilitation; NHANES: National Health and Nutrition Examination Survey; NCHS: National Center for Health Statistics; NPV: negative predictive value; PPV: positive predictive value; PR-AUC: area under the precision-recall curve; PROBAST+AI: Prediction model Risk Of Bias ASsessment Tool plus artificial intelligence; TRIPOD+AI: Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis plus artificial intelligence; UEMC: Miguel de Cervantes European University. + +## Declarations -Code and saved result artifacts are available at: . Raw NHANES data are available from the CDC. 
+### Ethics approval and consent to participate -## Funding +NHANES protocols were approved by the NCHS Research Ethics Review Board, and NHANES participants provided informed consent [8]. This secondary analysis used publicly available de-identified NHANES data and did not require additional local ethics approval. + +### Consent for publication + +Not applicable. The manuscript does not include individual person-level identifying data, images, or videos. + +### Availability of data and materials + +The datasets analyzed during the current study are publicly available from the CDC/NCHS NHANES website [7]. Code, tests, scripts, generated figures, and saved result artifacts are available in the project repository [6]. + +### Competing interests + +FTB is Executive Director of the Foundation for Oral Rehabilitation and founder/editor-in-chief of PerioSpot. ABV and DRC are affiliated with DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University. The authors report no financial competing interests directly related to the NHANES data, code, or periodontitis prediction model. + +### Funding No external funding was reported for this analysis. -## Conflicts of Interest +### Authors' contributions + +FTB conceived the study, implemented and verified the reproducible analysis workflow, generated figures, interpreted results, and drafted the manuscript. ABV contributed clinical and methodological interpretation, reviewed the oral-health framing, and reviewed the manuscript. DRC contributed clinical interpretation, reviewed the oral-health framing, and reviewed the manuscript. All authors approved the submitted manuscript. + +### Acknowledgements + +The authors acknowledge the Centers for Disease Control and Prevention, the National Center for Health Statistics, and the NHANES participants and staff who made the public-use data available. + +### Authors' information -The author declares no conflicts of interest. +Not applicable. 
-## AI-Use Disclosure +### AI-use disclosure -AI systems were used for drafting support, code review, and critique generation during manuscript development. The author reviewed and remains responsible for all analysis decisions, code changes, interpretation, and final claims. +AI systems were used for drafting support, code review, figure-label review, and critique generation during manuscript development. No AI system is listed as an author. The authors reviewed and remain responsible for all analysis decisions, code changes, interpretation, and final claims. ## References @@ -166,3 +212,6 @@ AI systems were used for drafting support, code review, and critique generation 3. Bashir NZ, Rahman Z, Chen SLS. Systematic comparison of machine learning algorithms to develop and validate predictive models for periodontitis. J Clin Periodontol. 2022;49:958-969. 4. Collins GS, Moons KGM, Dhiman P, et al. TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods. BMJ. 2024;385:e078378. 5. Moons KGM, Damen JAA, Kaul T, et al. PROBAST+AI: an updated quality, risk of bias, and applicability assessment tool for prediction models using regression or artificial intelligence methods. BMJ. 2025;388:e082505. +6. Barbosa FT, Brizuela-Velasco A, Robles Cantero D. NHANES Periodontitis Prediction Benchmark. GitHub. 2026. https://github.com/Tuminha/NHANES-Periodontitis-Machine-Learning-Project. Accessed 4 Jun 2026. +7. National Center for Health Statistics. National Health and Nutrition Examination Survey. Centers for Disease Control and Prevention. https://www.cdc.gov/nchs/nhanes/. Accessed 4 Jun 2026. +8. National Center for Health Statistics. Ethics Review Board Approval. Centers for Disease Control and Prevention. https://www.cdc.gov/nchs/nhanes/about/erb.html. Accessed 4 Jun 2026. 
diff --git a/docs/publication/BMC_ORAL_HEALTH_COVER_LETTER.md b/docs/publication/BMC_ORAL_HEALTH_COVER_LETTER.md new file mode 100644 index 0000000..39eaf40 --- /dev/null +++ b/docs/publication/BMC_ORAL_HEALTH_COVER_LETTER.md @@ -0,0 +1,22 @@ +# Cover Letter: BMC Oral Health + +Dear Editor, + +We are pleased to submit our manuscript, "Machine Learning for Periodontitis Prediction: A Realistic Benchmark with Same-Source Temporal Validation on NHANES," for consideration as a Research article in BMC Oral Health. + +This manuscript addresses a timely oral-health question: how much signal is realistically available from low-cost NHANES predictors for current periodontitis classification when model development is reported conservatively. Rather than presenting an artificial intelligence model as a deployable diagnostic tool, we frame the work as a transparent prediction-model benchmark. The study evaluates modern gradient boosting models, removes treatment-seeking variables from the primary model, applies calibration, performs same-source temporal validation on NHANES 2009-2010, and reports operating-point limitations, missingness sensitivity, survey-weighted prevalence, and subgroup performance. + +We believe the manuscript fits BMC Oral Health because it directly concerns oral-health epidemiology, digital dentistry, diagnostic methodology, and the prevention and management of disorders of the teeth and gums. Its main contribution is methodological validity and reproducibility: the repository includes maintained scripts, tests for CDC/AAP periodontitis classification logic, consistency checks linking result artifacts to manuscript claims, generated figures, and a reproducible full-analysis workflow. + +The findings are intentionally cautious. The primary model achieved internal AUC-ROC 0.6896 and same-source temporal AUC-ROC 0.6495. 
These results show that low-cost predictors may support research benchmarking and risk-stratification experiments, but do not establish a clinically ready screening or diagnostic system. We think this conservative conclusion is important because overly optimistic claims from internally validated machine-learning studies can mislead clinical interpretation. + +All authors have approved the submitted manuscript. The manuscript is original, is not under consideration elsewhere, and has not been published previously. The authors report no financial competing interests directly related to the NHANES data, code, or periodontitis prediction model. Potential non-financial interests are disclosed in the manuscript. + +Thank you for considering our submission. + +Sincerely, + +Francisco Teixeira Barbosa +Foundation for Oral Rehabilitation (FOR) +DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University +cisco@periospot.com diff --git a/docs/publication/BMC_SUBMISSION_CHECKLIST.md b/docs/publication/BMC_SUBMISSION_CHECKLIST.md new file mode 100644 index 0000000..99b2c15 --- /dev/null +++ b/docs/publication/BMC_SUBMISSION_CHECKLIST.md @@ -0,0 +1,55 @@ +# BMC Oral Health Submission Checklist + +Target journal: BMC Oral Health +Article type: Research article +Submission portal: https://submission.nature.com/ + +## Ready Files + +- Main manuscript source: `docs/publication/ARTICLE_DRAFT.md` +- Rendered PDF for review: `reports/manuscript_publication_repair.pdf` +- Line-numbered review text: `reports/ARTICLE_DRAFT_line_numbered.txt` +- Cover letter draft: `docs/publication/BMC_ORAL_HEALTH_COVER_LETTER.md` +- Figure 1 PNG/PDF: `figures/19_publication_performance_summary.*` +- Figure 2 PNG/PDF: `figures/20_publication_sensitivity_summary.*` +- Result tables: `results/publication_sensitivity_tables.md` +- Code/reproducibility repository: https://github.com/Tuminha/NHANES-Periodontitis-Machine-Learning-Project + +## Author Details To Confirm In The 
Submission Portal + +1. Francisco Teixeira Barbosa + - Affiliations: + - Foundation for Oral Rehabilitation (FOR), Werftestrasse 4, 6002 Luzern, Switzerland. + - DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University (UEMC), Padre Julio Chevalier 2, 47012 Valladolid, Spain. + - Corresponding author: yes + - Email: `cisco@periospot.com` + - ORCID: add if available + +2. Aritza Brizuela-Velasco + - Affiliation: DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University (UEMC), Padre Julio Chevalier 2, 47012 Valladolid, Spain. + - Email: add in submission portal + - ORCID: add if available + +3. Daniel Robles Cantero + - Affiliation: DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University (UEMC), Padre Julio Chevalier 2, 47012 Valladolid, Spain. + - Email: add in submission portal + - ORCID: add if available + +## Human Checks Before Clicking Submit + +- Confirm Aritza and Daniel approve the final rendered manuscript, not only authorship and affiliation. +- Confirm author order: Francisco first, Aritza second, Daniel last. +- Confirm whether UEMC/DENS-ia requires a specific institutional spelling or additional department name. +- Confirm APC funding route, waiver, or institutional open-access agreement. +- Confirm no author has an undisclosed financial or non-financial competing interest. +- Confirm the submitted cover letter wording is acceptable to all authors. + +## Journal-Specific Checks Already Addressed + +- Structured abstract with Background, Methods, Results, Conclusions, and Trial registration. +- Title page with full names, affiliations, and corresponding author. +- BMC-style Declarations section with all required subheadings. +- Line-numbered manuscript review file. +- Figures generated from current result artifacts with legends in the manuscript. +- Public NHANES ethics and data availability statements. 
+- AI-use disclosure stating that no AI system is an author and authors remain responsible. diff --git a/figures/19_publication_performance_summary.pdf b/figures/19_publication_performance_summary.pdf new file mode 100644 index 0000000..941aa01 Binary files /dev/null and b/figures/19_publication_performance_summary.pdf differ diff --git a/figures/19_publication_performance_summary.png b/figures/19_publication_performance_summary.png new file mode 100644 index 0000000..6b9aebb Binary files /dev/null and b/figures/19_publication_performance_summary.png differ diff --git a/figures/20_publication_sensitivity_summary.pdf b/figures/20_publication_sensitivity_summary.pdf new file mode 100644 index 0000000..8a4dc18 Binary files /dev/null and b/figures/20_publication_sensitivity_summary.pdf differ diff --git a/figures/20_publication_sensitivity_summary.png b/figures/20_publication_sensitivity_summary.png new file mode 100644 index 0000000..152c176 Binary files /dev/null and b/figures/20_publication_sensitivity_summary.png differ diff --git a/reports/ARTICLE_DRAFT_line_numbered.txt b/reports/ARTICLE_DRAFT_line_numbered.txt index 2d6abf4..1b0ab19 100644 --- a/reports/ARTICLE_DRAFT_line_numbered.txt +++ b/reports/ARTICLE_DRAFT_line_numbered.txt @@ -1,168 +1,217 @@ 0001 # Machine Learning for Periodontitis Prediction: A Realistic Benchmark with Same-Source Temporal Validation on NHANES -0002 -0003 **Draft version:** publication-readiness repair, June 2026 -0004 **Article type:** prediction-model methodology benchmark -0005 **Reporting target:** TRIPOD+AI-aligned development and validation report -0006 -0007 ## Abstract -0008 -0009 **Background:** Machine-learning studies for periodontitis prediction have reported very high internal discrimination using low-cost NHANES predictors. Such estimates require careful reassessment because internal validation, missing-data handling, calibration, and treatment-seeking predictors can materially change apparent performance. 
-0010 -0011 **Methods:** We analyzed NHANES 2011-2014 adults age 30 years or older with full periodontal examinations (`n=9,034`). Periodontitis was classified using CDC/AAP case definitions. XGBoost, LightGBM, and CatBoost were compared using stratified 5-fold cross-validation. The primary calibrated ensemble excluded treatment-seeking variables and used 29 predictors. A secondary 33-feature model restored dental visit, flossing, loose teeth, and floss-missing variables to estimate their incremental contribution. The frozen primary model was evaluated on NHANES 2009-2010 (`n=5,037`) as same-source temporal validation. Missingness ablations and deployment-ready feature analysis were used to assess whether NHANES missingness patterns contributed survey-specific signal. -0012 -0013 **Results:** The primary 29-feature model achieved internal AUC-ROC 0.6896 and PR-AUC 0.8240. The secondary 33-feature model achieved AUC-ROC 0.6996 and PR-AUC 0.8295, indicating that treatment-seeking variables added 0.0100 AUC. In same-source temporal validation, the frozen primary model achieved AUC-ROC 0.6495, PR-AUC 0.7727, and Brier score 0.2023. At threshold 0.35, sensitivity was 98.9% and specificity was 5.5%; at threshold 0.65, sensitivity was 77.7% and specificity was 45.2%. These operating points are not diagnostic rules and require periodontal examination for confirmation. -0014 -0015 **Conclusions:** Low-cost NHANES predictors appear to support realistic discrimination around 0.69 internally and 0.65 under same-source temporal validation. The study is best interpreted as a benchmark and cautionary methods report, not as evidence that a deployable diagnostic screening model has been established. Geographic validation, prospective clinical validation, local recalibration, and subgroup calibration remain necessary before implementation claims. 
-0016 -0017 **Keywords:** periodontitis; NHANES; prediction model; gradient boosting; calibration; missing data; TRIPOD+AI -0018 -0019 ## 1. Introduction -0020 -0021 Periodontitis is common among adults and is often identified only after irreversible tissue destruction has occurred. Low-cost risk stratification is attractive because full periodontal examination requires trained personnel, examination time, and access to dental care. However, a model built from demographic, behavioral, anthropometric, and metabolic predictors is not equivalent to clinical examination. -0022 -0023 Prior machine-learning work using NHANES has reported very high apparent discrimination. This manuscript reassesses the likely performance ceiling under stricter validation and reporting expectations. The central question is not whether a model can replace periodontal examination, but whether low-cost NHANES predictors contain enough signal to support reliable risk stratification and what methodological choices inflate or constrain that estimate. -0024 -0025 The objectives were to: -0026 -0027 1. Estimate realistic internal performance for modern gradient boosting models. -0028 2. Evaluate performance on an earlier NHANES cycle using a frozen model. -0029 3. Quantify the effect of treatment-seeking variables that may reflect existing disease. -0030 4. Assess missingness indicators and a deployment-ready feature set without NHANES-specific missingness flags. -0031 5. Reframe the model using prediction-model reporting standards and explicit limitations. -0032 -0033 ## 2. Methods -0034 -0035 ### 2.1 Study Design and Data Source -0036 -0037 This was a prediction-model benchmark using public NHANES data. The development cohort comprised NHANES 2011-2012 and 2013-2014. The same-source temporal validation cohort comprised NHANES 2009-2010. Later NHANES cycles were not used for temporal validation because full-mouth periodontal measurements required for CDC/AAP classification were discontinued. 
-0038 -0039 NHANES data are publicly available and de-identified. This secondary analysis did not require institutional review board approval. -0040 -0041 ### 2.2 Participants -0042 -0043 Eligible participants were adults age 30 years or older with full periodontal examination data and sufficient information for CDC/AAP periodontitis classification. The development cohort included 9,034 participants. The same-source temporal validation cohort included 5,037 participants. -0044 -0045 The analytic prevalence of periodontitis was approximately 69-72% unweighted and approximately 66% after applying examination weights by cycle, higher than general-population CDC estimates. This reflects the restricted full-examination analytic sample and should not be interpreted as the expected prevalence in a lower-risk screening population. -0046 -0047 ### 2.3 Outcome Definition -0048 -0049 The binary outcome was any periodontitis versus no periodontitis using CDC/AAP definitions. Severe, moderate, and mild classifications were assigned hierarchically from interproximal pocket depth and clinical attachment loss measurements, excluding third molars and enforcing different-teeth criteria where specified. -0050 -0051 The repository now includes synthetic tests for severe, moderate, mild, and no-periodontitis cases, third-molar exclusion, and different-teeth logic. -0052 -0053 ### 2.4 Predictors and Feature Sets -0054 -0055 The primary model used 29 predictors after excluding treatment-seeking variables that can be downstream of disease: -0056 -0057 - `dental_visit` -0058 - `floss_days` -0059 - `mobile_teeth` -0060 - `floss_days_missing` -0061 -0062 The secondary model used 33 predictors by restoring those variables. The secondary model is reported only as an upper-bound sensitivity analysis, not as the preferred screening model. 
-0063 -0064 Predictor categories included demographics, smoking and alcohol variables, anthropometric measures, blood pressure, fasting glucose, triglycerides, HDL, and missingness indicators. -0065 -0066 ### 2.5 Missing Data -0067 -0068 NHANES missingness is not purely random. Fasting laboratory variables are collected in subsamples, and questionnaire items may follow skip-pattern logic. Tree models were allowed to handle missing values natively, and missingness indicators were included in the primary model. A deployment-ready 15-feature model without missingness indicators was retained as a conservative lower-bound benchmark because NHANES-specific missingness may not transfer to clinical datasets. -0069 -0070 ### 2.6 Model Development and Calibration -0071 -0072 Gradient boosting models were tuned using Optuna and evaluated with stratified 5-fold cross-validation. Monotonic constraints were applied to selected continuous variables where clinical priors were clear. Isotonic calibration was used for probability calibration in the cross-validation workflow. -0073 -0074 The final temporal evaluation used the frozen primary model and pre-specified thresholds from the development workflow. Thresholds were not re-optimized on the temporal validation cohort. -0075 -0076 ### 2.7 Statistical Analysis -0077 -0078 Discrimination was summarized with AUC-ROC and PR-AUC. Calibration was summarized with Brier score and reliability plots. Operating points were reported using sensitivity, specificity, PPV, and NPV. Decision-curve analysis was included as a descriptive utility analysis only. -0079 -0080 Survey-weighted prevalence and subgroup performance are generated by `scripts/04_publication_analyses.py` when processed prediction tables are available. The current regenerated summary is saved in `results/publication_sensitivity_tables.md`. -0081 -0082 ## 3. Results -0083 -0084 ### 3.1 Cohorts -0085 -0086 The development cohort included 9,034 adults from NHANES 2011-2014. 
The temporal validation cohort included 5,037 adults from NHANES 2009-2010. Periodontitis prevalence was 70.9% in development data and 69.1% in temporal validation data before survey weighting. -0087 -0088 ### 3.2 Internal Model Performance -0089 -0090 | Model variant | Features | AUC-ROC | PR-AUC | Interpretation | -0091 |---|---:|---:|---:|---| -0092 | Primary model without treatment-seeking variables | 29 | 0.6896 | 0.8240 | Preferred benchmark model | -0093 | Secondary full-feature model | 33 | 0.6996 | 0.8295 | Upper-bound sensitivity analysis | -0094 | Deployment-ready core model | 15 | 0.6896 | 0.8237 | Conservative feature set without missingness indicators | -0095 -0096 The full-feature model improved AUC by 0.0100 over the primary model. This small difference supports excluding treatment-seeking variables from the primary benchmark. -0097 -0098 ### 3.3 Same-Source Temporal Validation -0099 -0100 | Metric | Development estimate | Temporal validation estimate | -0101 |---|---:|---:| -0102 | N | 9,034 | 5,037 | -0103 | Prevalence | 70.9% | 69.1% | -0104 | AUC-ROC | 0.6896 | 0.6495 (95% CI 0.6315-0.6664) | -0105 | PR-AUC | 0.8240 | 0.7727 (95% CI 0.7570-0.7885) | -0106 | Brier score | 0.1871 | 0.2023 (95% CI 0.1955-0.2085) | -0107 -0108 The drop from internal AUC 0.6896 to temporal AUC 0.6495 is consistent with a modest but meaningful generalization gap. -0109 -0110 ### 3.4 Operating Points -0111 -0112 | Threshold | Sensitivity | Specificity | PPV | NPV | -0113 |---:|---:|---:|---:|---:| -0114 | 0.35 | 98.9% | 5.5% | 70.0% | 69.1% | -0115 | 0.65 | 77.7% | 45.2% | 76.0% | 47.5% | -0116 -0117 The 0.35 threshold prioritizes sensitivity but has very low specificity and an NPV of only 69.1% in this high-prevalence cohort. It should be described as a high-sensitivity triage operating point, not a reliable disease-exclusion rule. 
-0118 -0119 ### 3.5 Missingness and Survey-Design Sensitivity -0120 -0121 Missingness indicators contributed limited but measurable predictive signal. Missingness patterns were broadly comparable between the development and temporal validation cohorts, but the signal may still be NHANES-specific. The deployment-ready 15-feature model remains the most conservative estimate for settings where NHANES missingness patterns are unavailable. -0122 -0123 Survey-weighted prevalence and subgroup-performance tables were generated from processed prediction data and are saved in `results/publication_sensitivity_tables.md`. Weighted prevalence was approximately 65.6% in 2009-2010 and 66.2-66.3% in 2011-2014. Subgroup analyses are descriptive and should not be overinterpreted as evidence of transportability. -0124 -0125 ## 4. Discussion -0126 -0127 This repair changes the interpretation of the study. The main contribution is not a clinically ready model. The contribution is a reproducible benchmark showing that realistic performance for low-cost periodontitis predictors is far below highly optimistic internal estimates reported in some prior work. -0128 -0129 The same-source temporal validation result is important but limited. Because both cohorts come from NHANES, performance does not establish geographic transportability, prospective clinical performance, or behavior in lower-prevalence screening populations. -0130 -0131 The operating points also require conservative interpretation. High sensitivity at threshold 0.35 comes at the cost of low specificity, and the NPV does not support reassuring individual patients without periodontal examination. -0132 -0133 ## 5. Limitations -0134 -0135 1. The validation cohort is temporally distinct but not independent by geography, health system, or measurement program. -0136 2. The analytic cohort is restricted to adults with full periodontal examination data and has high disease prevalence. -0137 3. 
NHANES missingness indicators may encode survey logistics rather than transportable clinical information. -0138 4. Survey-weighted and subgroup calibration tables must be regenerated before submission and interpreted carefully. -0139 5. The model predicts current case status, not future incident periodontitis. -0140 6. The manuscript does not establish patient benefit, treatment impact, or workflow safety. -0141 -0142 ## 6. Conclusions -0143 -0144 The primary 29-feature model achieved AUC-ROC 0.6896 internally and 0.6495 under same-source temporal validation. A secondary 33-feature model provided only a small apparent gain, supporting exclusion of treatment-seeking variables from the preferred benchmark. These results support a cautious methodological conclusion: low-cost NHANES predictors provide moderate discrimination and are useful for benchmarking, but they do not establish a clinically ready periodontitis screening system. -0145 -0146 ## Data and Code Availability -0147 -0148 Code and saved result artifacts are available at: . Raw NHANES data are available from the CDC. -0149 -0150 ## Funding -0151 -0152 No external funding was reported for this analysis. -0153 -0154 ## Conflicts of Interest -0155 -0156 The author declares no conflicts of interest. -0157 -0158 ## AI-Use Disclosure -0159 -0160 AI systems were used for drafting support, code review, and critique generation during manuscript development. The author reviewed and remains responsible for all analysis decisions, code changes, interpretation, and final claims. -0161 -0162 ## References -0163 -0164 1. Eke PI, Page RC, Wei L, Thornton-Evans G, Genco RJ. Update of the case definitions for population-based surveillance of periodontitis. J Periodontol. 2012;83(12):1449-1454. -0165 2. Eke PI, Dye BA, Wei L, et al. Update on prevalence of periodontitis in adults in the United States: NHANES 2009-2012. J Periodontol. 2015;86(5):611-622. -0166 3. Bashir NZ, Rahman Z, Chen SLS. 
Systematic comparison of machine learning algorithms to develop and validate predictive models for periodontitis. J Clin Periodontol. 2022;49:958-969. -0167 4. Collins GS, Moons KGM, Dhiman P, et al. TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods. BMJ. 2024;385:e078378. -0168 5. Moons KGM, Damen JAA, Kaul T, et al. PROBAST+AI: an updated quality, risk of bias, and applicability assessment tool for prediction models using regression or artificial intelligence methods. BMJ. 2025;388:e082505. +0002 +0003 **Authors:** Francisco Teixeira Barbosa^1,2*, Aritza Brizuela-Velasco^2, Daniel Robles Cantero^2 +0004 +0005 **Affiliations:** +0006 +0007 1. Foundation for Oral Rehabilitation (FOR), Werftestrasse 4, 6002 Luzern, Switzerland. +0008 2. DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University (UEMC), Padre Julio Chevalier 2, 47012 Valladolid, Spain. +0009 +0010 **Corresponding author:** Francisco Teixeira Barbosa, cisco@periospot.com +0011 +0012 - **Draft version:** BMC Oral Health submission preparation, June 2026 +0013 - **Target journal:** BMC Oral Health +0014 - **Article type:** Research article +0015 - **Reporting target:** TRIPOD+AI-aligned development and validation report +0016 +0017 ## Abstract +0018 +0019 **Background:** Machine-learning studies for periodontitis prediction have reported very high internal discrimination using low-cost NHANES predictors. Such estimates require careful reassessment because internal validation, missing-data handling, calibration, and treatment-seeking predictors can materially change apparent performance. +0020 +0021 **Methods:** We analyzed NHANES 2011-2014 adults age 30 years or older with full periodontal examinations (`n=9,034`). Periodontitis was classified using CDC/AAP case definitions. XGBoost, LightGBM, and CatBoost were compared using stratified 5-fold cross-validation. 
The primary calibrated ensemble excluded treatment-seeking variables and used 29 predictors. A secondary 33-feature model restored dental visit, flossing, loose teeth, and floss-missing variables to estimate their incremental contribution. The frozen primary model was evaluated on NHANES 2009-2010 (`n=5,037`) as same-source temporal validation. Missingness ablations and deployment-ready feature analysis were used to assess whether NHANES missingness patterns contributed survey-specific signal. +0022 +0023 **Results:** The primary 29-feature model achieved internal AUC-ROC 0.6896 and PR-AUC 0.8240. The secondary 33-feature model achieved AUC-ROC 0.6996 and PR-AUC 0.8295, indicating that treatment-seeking variables added 0.0100 AUC. In same-source temporal validation, the frozen primary model achieved AUC-ROC 0.6495, PR-AUC 0.7727, and Brier score 0.2023. At threshold 0.35, sensitivity was 98.9% and specificity was 5.5%; at threshold 0.65, sensitivity was 77.7% and specificity was 45.2%. These operating points are not diagnostic rules and require periodontal examination for confirmation. +0024 +0025 **Conclusions:** Low-cost NHANES predictors appear to support realistic discrimination around 0.69 internally and 0.65 under same-source temporal validation. The study is best interpreted as a benchmark and cautionary methods report, not as evidence that a deployable diagnostic screening model has been established. Geographic validation, prospective clinical validation, local recalibration, and subgroup calibration remain necessary before implementation claims. +0026 +0027 **Trial registration:** Not applicable. +0028 +0029 **Keywords:** periodontitis; NHANES; prediction model; gradient boosting; calibration; missing data; TRIPOD+AI +0030 +0031 ## Background +0032 +0033 Periodontitis is common among adults and is often identified only after irreversible tissue destruction has occurred. 
Low-cost risk stratification is attractive because full periodontal examination requires trained personnel, examination time, and access to dental care. However, a model built from demographic, behavioral, anthropometric, and metabolic predictors is not equivalent to clinical examination. +0034 +0035 Prior machine-learning work using NHANES has reported very high apparent discrimination. This manuscript reassesses the likely performance ceiling under stricter validation and reporting expectations. The central question is not whether a model can replace periodontal examination, but whether low-cost NHANES predictors contain enough signal to support reliable risk stratification and what methodological choices inflate or constrain that estimate. +0036 +0037 The objectives were to: +0038 +0039 1. Estimate realistic internal performance for modern gradient boosting models. +0040 2. Evaluate performance on an earlier NHANES cycle using a frozen model. +0041 3. Quantify the effect of treatment-seeking variables that may reflect existing disease. +0042 4. Assess missingness indicators and a deployment-ready feature set without NHANES-specific missingness flags. +0043 5. Reframe the model using prediction-model reporting standards and explicit limitations. +0044 +0045 ## Methods +0046 +0047 ### Study Design and Data Source +0048 +0049 This was a prediction-model benchmark using public NHANES data. The development cohort comprised NHANES 2011-2012 and 2013-2014. The same-source temporal validation cohort comprised NHANES 2009-2010. Later NHANES cycles were not used for temporal validation because full-mouth periodontal measurements required for CDC/AAP classification were discontinued. +0050 +0051 NHANES data are publicly available and de-identified. This secondary analysis did not require institutional review board approval. 
+0052 +0053 ### Participants +0054 +0055 Eligible participants were adults age 30 years or older with full periodontal examination data and sufficient information for CDC/AAP periodontitis classification. The development cohort included 9,034 participants. The same-source temporal validation cohort included 5,037 participants. +0056 +0057 The analytic prevalence of periodontitis was approximately 69-72% unweighted and approximately 66% after applying examination weights by cycle, higher than general-population CDC estimates. This reflects the restricted full-examination analytic sample and should not be interpreted as the expected prevalence in a lower-risk screening population. +0058 +0059 ### Outcome Definition +0060 +0061 The binary outcome was any periodontitis versus no periodontitis using CDC/AAP definitions. Severe, moderate, and mild classifications were assigned hierarchically from interproximal pocket depth and clinical attachment loss measurements, excluding third molars and enforcing different-teeth criteria where specified. +0062 +0063 The repository now includes synthetic tests for severe, moderate, mild, and no-periodontitis cases, third-molar exclusion, and different-teeth logic. +0064 +0065 ### Predictors and Feature Sets +0066 +0067 The primary model used 29 predictors after excluding treatment-seeking variables that can be downstream of disease: +0068 +0069 - `dental_visit` +0070 - `floss_days` +0071 - `mobile_teeth` +0072 - `floss_days_missing` +0073 +0074 The secondary model used 33 predictors by restoring those variables. The secondary model is reported only as an upper-bound sensitivity analysis, not as the preferred screening model. +0075 +0076 Predictor categories included demographics, smoking and alcohol variables, anthropometric measures, blood pressure, fasting glucose, triglycerides, HDL, and missingness indicators. +0077 +0078 ### Missing Data +0079 +0080 NHANES missingness is not purely random. 
Fasting laboratory variables are collected in subsamples, and questionnaire items may follow skip-pattern logic. Tree models were allowed to handle missing values natively, and missingness indicators were included in the primary model. A deployment-ready 15-feature model without missingness indicators was retained as a conservative lower-bound benchmark because NHANES-specific missingness may not transfer to clinical datasets. +0081 +0082 ### Model Development and Calibration +0083 +0084 Gradient boosting models were tuned using Optuna and evaluated with stratified 5-fold cross-validation. Monotonic constraints were applied to selected continuous variables where clinical priors were clear. Isotonic calibration was used for probability calibration in the cross-validation workflow. +0085 +0086 The final temporal evaluation used the frozen primary model and pre-specified thresholds from the development workflow. Thresholds were not re-optimized on the temporal validation cohort. +0087 +0088 ### Statistical Analysis +0089 +0090 Discrimination was summarized with AUC-ROC and PR-AUC. Calibration was summarized with Brier score and reliability plots. Operating points were reported using sensitivity, specificity, PPV, and NPV. Decision-curve analysis was included as a descriptive utility analysis only. +0091 +0092 Survey-weighted prevalence and subgroup performance are generated by `scripts/04_publication_analyses.py` when processed prediction tables are available. The current regenerated summary is saved in `results/publication_sensitivity_tables.md`. +0093 +0094 ## Results +0095 +0096 ### Cohorts +0097 +0098 The development cohort included 9,034 adults from NHANES 2011-2014. The temporal validation cohort included 5,037 adults from NHANES 2009-2010. Periodontitis prevalence was 70.9% in development data and 69.1% in temporal validation data before survey weighting. 
+0099 +0100 ### Internal Model Performance +0101 +0102 | Model variant | Features | AUC-ROC | PR-AUC | Interpretation | +0103 |---|---:|---:|---:|---| +0104 | Primary model without treatment-seeking variables | 29 | 0.6896 | 0.8240 | Preferred benchmark model | +0105 | Secondary full-feature model | 33 | 0.6996 | 0.8295 | Upper-bound sensitivity analysis | +0106 | Deployment-ready core model | 15 | 0.6896 | 0.8237 | Conservative feature set without missingness indicators | +0107 +0108 The full-feature model improved AUC by 0.0100 over the primary model. This small difference supports excluding treatment-seeking variables from the primary benchmark. +0109 +0110 ### Same-Source Temporal Validation +0111 +0112 | Metric | Development estimate | Temporal validation estimate | +0113 |---|---:|---:| +0114 | N | 9,034 | 5,037 | +0115 | Prevalence | 70.9% | 69.1% | +0116 | AUC-ROC | 0.6896 | 0.6495 (95% CI 0.6315-0.6664) | +0117 | PR-AUC | 0.8240 | 0.7727 (95% CI 0.7570-0.7885) | +0118 | Brier score | 0.1871 | 0.2023 (95% CI 0.1955-0.2085) | +0119 +0120 The drop from internal AUC 0.6896 to temporal AUC 0.6495 is consistent with a modest but meaningful generalization gap. +0121 +0122 ### Operating Points +0123 +0124 | Threshold | Sensitivity | Specificity | PPV | NPV | +0125 |---:|---:|---:|---:|---:| +0126 | 0.35 | 98.9% | 5.5% | 70.0% | 69.1% | +0127 | 0.65 | 77.7% | 45.2% | 76.0% | 47.5% | +0128 +0129 The 0.35 threshold prioritizes sensitivity but has very low specificity and an NPV of only 69.1% in this high-prevalence cohort. It should be described as a high-sensitivity triage operating point, not a reliable disease-exclusion rule. +0130 +0131 Figure 1 summarizes the discrimination, calibration error, temporal operating points, and primary-versus-secondary feature-set comparison. +0132 +0133 ![Figure 1. Model performance summary. Panel A shows internal and same-source temporal discrimination for the primary and secondary model variants. 
Panel B shows Brier score, where lower values indicate lower calibration error. Panel C shows sensitivity, specificity, PPV, and NPV for the frozen primary model in NHANES 2009-2010 at thresholds 0.35 and 0.65. Panel D shows that adding treatment-seeking variables increased the feature count from 29 to 33 and changed internal AUC by 0.0100.](figures/19_publication_performance_summary.png) +0134 +0135 ### Missingness and Survey-Design Sensitivity +0136 +0137 Missingness indicators contributed limited but measurable predictive signal. Missingness patterns were broadly comparable between the development and temporal validation cohorts, but the signal may still be NHANES-specific. The deployment-ready 15-feature model remains the most conservative estimate for settings where NHANES missingness patterns are unavailable. +0138 +0139 Survey-weighted prevalence and subgroup-performance tables were generated from processed prediction data and are saved in `results/publication_sensitivity_tables.md`. Weighted prevalence was approximately 65.6% in 2009-2010 and 66.2-66.3% in 2011-2014. Subgroup analyses are descriptive and should not be overinterpreted as evidence of transportability. +0140 +0141 Figure 2 summarizes weighted prevalence, selected subgroup AUCs, and the highest missingness proportions among predictors. +0142 +0143 ![Figure 2. Survey sensitivity summary. Panel A compares unweighted and survey-weighted periodontitis prevalence by NHANES cycle. Panel B shows temporal AUC-ROC across selected age, sex, smoking, and metabolic-risk subgroups, with the overall temporal AUC shown as a dashed reference line. Panel C shows the predictors with the highest missingness proportions, highlighting the fasting laboratory variables as the dominant source of missingness.](figures/20_publication_sensitivity_summary.png) +0144 +0145 ## Discussion +0146 +0147 This repair changes the interpretation of the study. The main contribution is not a clinically ready model. 
The contribution is a reproducible benchmark showing that realistic performance for low-cost periodontitis predictors is far below highly optimistic internal estimates reported in some prior work. +0148 +0149 The same-source temporal validation result is important but limited. Because both cohorts come from NHANES, performance does not establish geographic transportability, prospective clinical performance, or behavior in lower-prevalence screening populations. +0150 +0151 The operating points also require conservative interpretation. High sensitivity at threshold 0.35 comes at the cost of low specificity, and the NPV does not support reassuring individual patients without periodontal examination. +0152 +0153 ## Limitations +0154 +0155 1. The validation cohort is temporally distinct but not independent by geography, health system, or measurement program. +0156 2. The analytic cohort is restricted to adults with full periodontal examination data and has high disease prevalence. +0157 3. NHANES missingness indicators may encode survey logistics rather than transportable clinical information. +0158 4. Survey-weighted and subgroup calibration tables are descriptive and should be interpreted carefully. +0159 5. The model predicts current case status, not future incident periodontitis. +0160 6. The manuscript does not establish patient benefit, treatment impact, or workflow safety. +0161 +0162 ## Conclusions +0163 +0164 The primary 29-feature model achieved AUC-ROC 0.6896 internally and 0.6495 under same-source temporal validation. A secondary 33-feature model provided only a small apparent gain, supporting exclusion of treatment-seeking variables from the preferred benchmark. These results support a cautious methodological conclusion: low-cost NHANES predictors provide moderate discrimination and are useful for benchmarking, but they do not establish a clinically ready periodontitis screening system. 
+0165 +0166 ## List of abbreviations +0167 +0168 AUC-ROC: area under the receiver operating characteristic curve; Brier: Brier score; CDC: Centers for Disease Control and Prevention; CI: confidence interval; FOR: Foundation for Oral Rehabilitation; NHANES: National Health and Nutrition Examination Survey; NCHS: National Center for Health Statistics; NPV: negative predictive value; PPV: positive predictive value; PR-AUC: area under the precision-recall curve; PROBAST+AI: Prediction model Risk Of Bias ASsessment Tool plus artificial intelligence; TRIPOD+AI: Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis plus artificial intelligence; UEMC: Miguel de Cervantes European University. +0169 +0170 ## Declarations +0171 +0172 ### Ethics approval and consent to participate +0173 +0174 NHANES protocols were approved by the NCHS Research Ethics Review Board, and NHANES participants provided informed consent [8]. This secondary analysis used publicly available de-identified NHANES data and did not require additional local ethics approval. +0175 +0176 ### Consent for publication +0177 +0178 Not applicable. The manuscript does not include individual person-level identifying data, images, or videos. +0179 +0180 ### Availability of data and materials +0181 +0182 The datasets analyzed during the current study are publicly available from the CDC/NCHS NHANES website [7]. Code, tests, scripts, generated figures, and saved result artifacts are available in the project repository [6]. +0183 +0184 ### Competing interests +0185 +0186 FTB is Executive Director of the Foundation for Oral Rehabilitation and founder/editor-in-chief of PerioSpot. ABV and DRC are affiliated with DENS-ia Research Group, Faculty of Health Sciences, Miguel de Cervantes European University. The authors report no financial competing interests directly related to the NHANES data, code, or periodontitis prediction model. 
+0187 +0188 ### Funding +0189 +0190 No external funding was reported for this analysis. +0191 +0192 ### Authors' contributions +0193 +0194 FTB conceived the study, implemented and verified the reproducible analysis workflow, generated figures, interpreted results, and drafted the manuscript. ABV contributed clinical and methodological interpretation, reviewed the oral-health framing, and reviewed the manuscript. DRC contributed clinical interpretation, reviewed the oral-health framing, and reviewed the manuscript. All authors approved the submitted manuscript. +0195 +0196 ### Acknowledgements +0197 +0198 The authors acknowledge the Centers for Disease Control and Prevention, the National Center for Health Statistics, and the NHANES participants and staff who made the public-use data available. +0199 +0200 ### Authors' information +0201 +0202 Not applicable. +0203 +0204 ### AI-use disclosure +0205 +0206 AI systems were used for drafting support, code review, figure-label review, and critique generation during manuscript development. No AI system is listed as an author. The authors reviewed and remain responsible for all analysis decisions, code changes, interpretation, and final claims. +0207 +0208 ## References +0209 +0210 1. Eke PI, Page RC, Wei L, Thornton-Evans G, Genco RJ. Update of the case definitions for population-based surveillance of periodontitis. J Periodontol. 2012;83(12):1449-1454. +0211 2. Eke PI, Dye BA, Wei L, et al. Update on prevalence of periodontitis in adults in the United States: NHANES 2009-2012. J Periodontol. 2015;86(5):611-622. +0212 3. Bashir NZ, Rahman Z, Chen SLS. Systematic comparison of machine learning algorithms to develop and validate predictive models for periodontitis. J Clin Periodontol. 2022;49:958-969. +0213 4. Collins GS, Moons KGM, Dhiman P, et al. TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods. BMJ. 2024;385:e078378. +0214 5. 
Moons KGM, Damen JAA, Kaul T, et al. PROBAST+AI: an updated quality, risk of bias, and applicability assessment tool for prediction models using regression or artificial intelligence methods. BMJ. 2025;388:e082505. +0215 6. Barbosa FT, Brizuela-Velasco A, Robles Cantero D. NHANES Periodontitis Prediction Benchmark. GitHub. 2026. https://github.com/Tuminha/NHANES-Periodontitis-Machine-Learning-Project. Accessed 4 Jun 2026. +0216 7. National Center for Health Statistics. National Health and Nutrition Examination Survey. Centers for Disease Control and Prevention. https://www.cdc.gov/nchs/nhanes/. Accessed 4 Jun 2026. +0217 8. National Center for Health Statistics. Ethics Review Board Approval. Centers for Disease Control and Prevention. https://www.cdc.gov/nchs/nhanes/about/erb.html. Accessed 4 Jun 2026. diff --git a/scripts/05_number_manuscript_lines.py b/scripts/05_number_manuscript_lines.py index 817b56d..e09c1eb 100644 --- a/scripts/05_number_manuscript_lines.py +++ b/scripts/05_number_manuscript_lines.py @@ -20,7 +20,10 @@ def main() -> None: output.parent.mkdir(parents=True, exist_ok=True) lines = source.read_text(encoding="utf-8").splitlines() - numbered = [f"{idx:04d} {line}" for idx, line in enumerate(lines, start=1)] + numbered = [ + f"{idx:04d} {line}" if line else f"{idx:04d}" + for idx, line in enumerate(lines, start=1) + ] output.write_text("\n".join(numbered) + "\n", encoding="utf-8") print(f"Wrote {output}") diff --git a/scripts/06_generate_publication_figures.py b/scripts/06_generate_publication_figures.py new file mode 100644 index 0000000..07c4c3f --- /dev/null +++ b/scripts/06_generate_publication_figures.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +"""Generate manuscript figures from canonical publication result artifacts.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + + +ROOT = 
Path(__file__).resolve().parents[1] +FIGURES = ROOT / "figures" + +BLUE = "#15365a" +LIGHT_BLUE = "#2f80b7" +RED = "#8f2d2d" +GOLD = "#c28b12" +GREEN = "#2f7d42" +GRAY = "#6b7280" +LIGHT_GRAY = "#e5e7eb" + + +def load_json(path: str) -> dict: + with (ROOT / path).open("r", encoding="utf-8") as f: + return json.load(f) + + +def set_style() -> None: + plt.rcParams.update( + { + "font.family": "DejaVu Sans", + "font.size": 10, + "axes.titlesize": 12, + "axes.labelsize": 10, + "xtick.labelsize": 9, + "ytick.labelsize": 9, + "legend.fontsize": 9, + "figure.dpi": 150, + "savefig.dpi": 300, + "axes.spines.top": False, + "axes.spines.right": False, + "axes.grid": True, + "grid.color": LIGHT_GRAY, + "grid.linewidth": 0.7, + } + ) + + +def label_bars(ax: plt.Axes, bars, fmt: str = "{:.3f}", offset: float = 0.01) -> None: + for bar in bars: + height = bar.get_height() + ax.text( + bar.get_x() + bar.get_width() / 2, + height + offset, + fmt.format(height), + ha="center", + va="bottom", + fontsize=8, + ) + + +def label_horizontal_bars(ax: plt.Axes, bars, fmt: str = "{:.1f}%") -> None: + for bar in bars: + width = bar.get_width() + ax.text( + width + 0.01, + bar.get_y() + bar.get_height() / 2, + fmt.format(width * 100), + ha="left", + va="center", + fontsize=8, + ) + + +def save_figure(fig: plt.Figure, stem: str) -> None: + FIGURES.mkdir(parents=True, exist_ok=True) + fig.savefig(FIGURES / f"{stem}.png", bbox_inches="tight") + fig.savefig(FIGURES / f"{stem}.pdf", bbox_inches="tight") + plt.close(fig) + + +def model_rows() -> list[dict]: + primary = load_json("results/v13_primary_norc_summary.json") + secondary = load_json("results/v13_secondary_full_summary.json") + temporal = load_json("results/external_0910_metrics.json") + return [ + { + "label": "Primary internal\n29 features", + "auc": primary["metrics"]["auc_roc"], + "pr_auc": primary["metrics"]["pr_auc"], + "brier": primary["metrics"]["brier_score"], + "features": primary["n_features"], + }, + { + "label": "Secondary 
internal\n33 features", + "auc": secondary["metrics"]["auc_roc"], + "pr_auc": secondary["metrics"]["pr_auc"], + "brier": secondary["metrics"]["brier_score"], + "features": secondary["n_features"], + }, + { + "label": "Same-source temporal\nfrozen primary", + "auc": temporal["metrics"]["auc"]["mean"], + "pr_auc": temporal["metrics"]["prauc"]["mean"], + "brier": temporal["metrics"]["brier"]["mean"], + "features": primary["n_features"], + }, + ] + + +def plot_performance_summary() -> None: + rows = model_rows() + temporal = load_json("results/external_0910_metrics.json") + operating = temporal["operating_points"] + + fig, axes = plt.subplots(2, 2, figsize=(12, 8)) + fig.suptitle("Publication model performance summary", fontsize=15, fontweight="bold", y=0.99) + + labels = [row["label"] for row in rows] + x = np.arange(len(labels)) + width = 0.36 + + ax = axes[0, 0] + auc_bars = ax.bar(x - width / 2, [row["auc"] for row in rows], width, label="AUC-ROC", color=BLUE) + pr_bars = ax.bar(x + width / 2, [row["pr_auc"] for row in rows], width, label="PR-AUC", color=LIGHT_BLUE) + ax.set_title("A. Discrimination") + ax.set_ylabel("Metric value") + ax.set_xticks(x) + ax.set_xticklabels(labels) + ax.set_ylim(0, 1.02) + ax.legend(loc="lower right") + label_bars(ax, auc_bars, offset=0.015) + label_bars(ax, pr_bars, offset=0.015) + + ax = axes[0, 1] + brier_bars = ax.bar(x, [row["brier"] for row in rows], color=[BLUE, LIGHT_BLUE, RED]) + ax.set_title("B. 
Calibration error") + ax.set_ylabel("Brier score, lower is better") + ax.set_xticks(x) + ax.set_xticklabels(labels) + ax.set_ylim(0, 0.25) + label_bars(ax, brier_bars, offset=0.004) + + ax = axes[1, 0] + metrics = ["sensitivity", "specificity", "ppv", "npv"] + metric_labels = ["Sensitivity", "Specificity", "PPV", "NPV"] + op_labels = ["t = 0.35", "t = 0.65"] + values = [ + [operating["rule_out_t_0.35"][metric] for metric in metrics], + [operating["balanced_t_0.65"][metric] for metric in metrics], + ] + metric_x = np.arange(len(metrics)) + op_width = 0.36 + bars_035 = ax.bar(metric_x - op_width / 2, values[0], op_width, label=op_labels[0], color=GREEN) + bars_065 = ax.bar(metric_x + op_width / 2, values[1], op_width, label=op_labels[1], color=GOLD) + ax.set_title("C. Same-source temporal operating points") + ax.set_ylabel("Proportion") + ax.set_xticks(metric_x) + ax.set_xticklabels(metric_labels) + ax.set_ylim(0, 1.08) + ax.legend(loc="upper right") + label_bars(ax, bars_035, "{:.2f}", 0.015) + label_bars(ax, bars_065, "{:.2f}", 0.015) + + ax = axes[1, 1] + feature_counts = [rows[0]["features"], rows[1]["features"]] + auc_gain = rows[1]["auc"] - rows[0]["auc"] + bars = ax.bar(["Primary", "Secondary"], feature_counts, color=[BLUE, LIGHT_BLUE]) + ax.set_title("D. 
Feature-set comparison") + ax.set_ylabel("Number of predictors") + ax.set_ylim(0, 40) + for bar, count in zip(bars, feature_counts): + ax.text( + bar.get_x() + bar.get_width() / 2, + count + 1, + str(count), + ha="center", + va="bottom", + fontsize=9, + ) + ax.text( + 0.5, + 0.18, + f"Adding treatment-seeking variables changed AUC by {auc_gain:.4f}.", + ha="center", + va="center", + transform=ax.transAxes, + fontsize=10, + bbox={"boxstyle": "round,pad=0.4", "facecolor": "#f8fafc", "edgecolor": LIGHT_GRAY}, + ) + + fig.tight_layout(rect=(0, 0, 1, 0.97)) + save_figure(fig, "19_publication_performance_summary") + + +def subgroup_label(row: pd.Series) -> str: + variable = row["subgroup_variable"] + subgroup = row["subgroup"] + if variable == "sex": + subgroup = "Male" if float(subgroup) == 1.0 else "Female" + return f"Sex: {subgroup}" + if variable == "age_group": + return f"Age: {subgroup}" + if variable == "smoking": + subgroup = str(subgroup).replace("never/unknown", "never or unknown") + return f"Smoking: {subgroup}" + elif variable == "metabolic_risk": + subgroup = str(subgroup).replace("_", " ") + return f"Metabolic risk: {subgroup}" + return f"{variable.replace('_', ' ').title()}: {subgroup}" + + +def plot_sensitivity_summary() -> None: + payload = load_json("results/publication_sensitivity_tables.json") + temporal = load_json("results/external_0910_metrics.json") + prevalence = pd.DataFrame(payload["prevalence_by_cycle"]) + subgroup = pd.DataFrame(payload["subgroup_performance"]) + missingness = pd.DataFrame(payload["missingness"]).sort_values("missing_pct", ascending=False).head(7) + + selected_variables = {"age_group", "sex", "smoking", "metabolic_risk"} + subgroup = subgroup[subgroup["subgroup_variable"].isin(selected_variables)].copy() + subgroup["label"] = subgroup.apply(subgroup_label, axis=1) + variable_order = {"age_group": 0, "sex": 1, "smoking": 2, "metabolic_risk": 3} + subgroup["variable_order"] = subgroup["subgroup_variable"].map(variable_order) + 
subgroup = subgroup.sort_values(["variable_order", "subgroup"]) + + fig, axes = plt.subplots(1, 3, figsize=(14, 5)) + fig.suptitle("Survey and subgroup sensitivity summary", fontsize=15, fontweight="bold", y=1.03) + + ax = axes[0] + x = np.arange(len(prevalence)) + width = 0.36 + unweighted = ax.bar( + x - width / 2, + prevalence["unweighted_prevalence"], + width, + color=BLUE, + label="Unweighted", + ) + weighted = ax.bar( + x + width / 2, + prevalence["weighted_prevalence"], + width, + color=LIGHT_BLUE, + label="Survey-weighted", + ) + ax.set_title("A. Periodontitis prevalence") + ax.set_ylabel("Prevalence") + ax.set_xticks(x) + ax.set_xticklabels(prevalence["cycle"], rotation=20, ha="right") + ax.set_ylim(0, 0.82) + ax.legend(loc="upper right") + label_bars(ax, unweighted, "{:.2f}", 0.015) + label_bars(ax, weighted, "{:.2f}", 0.015) + + ax = axes[1] + y = np.arange(len(subgroup)) + bars = ax.barh(y, subgroup["auc"], color=BLUE) + ax.axvline(temporal["metrics"]["auc"]["mean"], color=RED, linestyle="--", linewidth=1.2, label="Overall temporal AUC") + ax.set_title("B. Temporal AUC by subgroup") + ax.set_xlabel("AUC-ROC") + ax.set_yticks(y) + ax.set_yticklabels(subgroup["label"]) + ax.set_xlim(0.5, 0.75) + ax.invert_yaxis() + ax.legend(loc="lower right") + for bar in bars: + ax.text(bar.get_width() + 0.004, bar.get_y() + bar.get_height() / 2, f"{bar.get_width():.3f}", va="center", fontsize=8) + + ax = axes[2] + missingness = missingness.iloc[::-1] + bars = ax.barh(np.arange(len(missingness)), missingness["missing_pct"], color=GRAY) + ax.set_title("C. 
Highest feature missingness") + ax.set_xlabel("Missing proportion") + ax.set_yticks(np.arange(len(missingness))) + ax.set_yticklabels(missingness["feature"]) + ax.set_xlim(0, max(0.65, float(missingness["missing_pct"].max()) + 0.08)) + label_horizontal_bars(ax, bars) + + fig.tight_layout() + save_figure(fig, "20_publication_sensitivity_summary") + + +def main() -> None: + set_style() + plot_performance_summary() + plot_sensitivity_summary() + print("Generated publication figures:") + for name in [ + "19_publication_performance_summary.png", + "19_publication_performance_summary.pdf", + "20_publication_sensitivity_summary.png", + "20_publication_sensitivity_summary.pdf", + ]: + print(f" figures/{name}") + + +if __name__ == "__main__": + main() diff --git a/scripts/verify_submission.py b/scripts/verify_submission.py index c3c427a..b6d83bf 100644 --- a/scripts/verify_submission.py +++ b/scripts/verify_submission.py @@ -87,6 +87,51 @@ def check_publication_analysis_outputs() -> None: raise AssertionError(f"Publication subgroup output missing strata: {sorted(missing)}") +def check_publication_figures() -> None: + expected = [ + ROOT / "figures/19_publication_performance_summary.png", + ROOT / "figures/19_publication_performance_summary.pdf", + ROOT / "figures/20_publication_sensitivity_summary.png", + ROOT / "figures/20_publication_sensitivity_summary.pdf", + ] + for path in expected: + if not path.exists() or path.stat().st_size < 5_000: + raise AssertionError(f"Publication figure missing or unexpectedly small: {path.relative_to(ROOT)}") + + +def check_bmc_manuscript_sections() -> None: + manuscript = ROOT / "docs/publication/ARTICLE_DRAFT.md" + text = manuscript.read_text(encoding="utf-8") + required = [ + "**Authors:** Francisco Teixeira Barbosa", + "**Corresponding author:**", + "**Trial registration:** Not applicable.", + "![Figure 1. Model performance summary.", + "![Figure 2. 
Survey sensitivity summary.", + "## List of abbreviations", + "## Declarations", + "### Ethics approval and consent to participate", + "### Consent for publication", + "### Availability of data and materials", + "### Competing interests", + "### Funding", + "### Authors' contributions", + "### Acknowledgements", + ] + missing = [needle for needle in required if needle not in text] + if missing: + raise AssertionError(f"BMC manuscript sections missing: {missing}") + + abstract = text.split("## Abstract", 1)[1].split("## Background", 1)[0] + abstract_words = [ + word + for word in abstract.replace("**", "").replace("`", "").split() + if word.strip() + ] + if len(abstract_words) > 350: + raise AssertionError(f"BMC abstract exceeds 350 words: {len(abstract_words)}") + + def check_publication_wording() -> None: banned = ["Publication Ready", "External Validation", "clinical deployment", "negative result rules out"] for path in PUBLICATION_FILES: @@ -124,6 +169,8 @@ def main() -> None: check_reproduction_hooks() check_temporal_metric_shape() check_publication_analysis_outputs() + check_publication_figures() + check_bmc_manuscript_sections() check_publication_wording() check_nhanes_urls() check_manuscript_render_support()