<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Bioinform Biotech</journal-id><journal-id journal-id-type="publisher-id">bioinform</journal-id><journal-id journal-id-type="index">19</journal-id><journal-title>JMIR Bioinformatics and Biotechnology</journal-title><abbrev-journal-title>JMIR Bioinform Biotech</abbrev-journal-title><issn pub-type="epub">2563-3570</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v7i1e85659</article-id><article-id pub-id-type="doi">10.2196/85659</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Temporal Reproducibility of a Genetic Algorithm&#x2013;Derived Health Risk Score: Standardized Out-of-Fold Validation Framework (2021-2023)</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Aoki</surname><given-names>Yoichiro</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Takeda</surname><given-names>Hiroki</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yokota</surname><given-names>Kinichi</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Yoshida</surname><given-names>Ryoko</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Yoshida Hospital-Keiyukai Medical Corporation</institution><addr-line>1-2, Nishi 4-chome, 4-jyo</addr-line><addr-line>Asahikawa</addr-line><addr-line>Hokkaido</addr-line><country>Japan</country></aff><aff id="aff2"><institution>Department of Cardiovascular Medicine, Yoshida Hospital-Keiyukai Medical Corporation</institution><addr-line>Asahikawa</addr-line><addr-line>Hokkaido</addr-line><country>Japan</country></aff><aff id="aff3"><institution>Department of Gastroenterology, Yoshida Hospital-Keiyukai Medical Corporation</institution><addr-line>Asahikawa</addr-line><addr-line>Hokkaido</addr-line><country>Japan</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Yue</surname><given-names>Zongliang</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Khamesipour</surname><given-names>Faham</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Shannawaz</surname><given-names>Mohd</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Yoichiro Aoki, MD, PhD, Yoshida Hospital-Keiyukai Medical Corporation, 1-2, Nishi 4-chome, 4-jyo, Asahikawa, Hokkaido, 070-0054, Japan, 81 166-23-0685; <email>y-aoki@keiyukai-group.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>21</day><month>4</month><year>2026</year></pub-date><volume>7</volume><elocation-id>e85659</elocation-id><history><date date-type="received"><day>14</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>28</day><month>02</month><year>2026</year></date><date 
date-type="accepted"><day>13</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Yoichiro Aoki, Hiroki Takeda, Kinichi Yokota, Ryoko Yoshida. Originally published in JMIR Bioinformatics and Biotechnology (<ext-link ext-link-type="uri" xlink:href="https://bioinform.jmir.org">https://bioinform.jmir.org</ext-link>), 21.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/">http://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Bioinformatics and Biotechnology, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://bioinform.jmir.org/">https://bioinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://bioinform.jmir.org/2026/1/e85659"/><abstract><sec><title>Background</title><p>Genetic algorithm (GA)&#x2013;based scoring has been proposed as a data-driven approach for health risk stratification. However, performance estimates may be inflated when preprocessing, optimization, and evaluation are not strictly separated within a prespecified validation framework. 
Demonstrating temporal reproducibility under a standardized out-of-fold (OOF) evaluation framework with transparent uncertainty quantification is therefore essential for ensuring translational reliability in preventive health screening.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate the temporal reproducibility of a GA-derived composite health risk score across three consecutive annual cohorts (2021&#x2010;2023) under a standardized OOF validation pipeline and to assess robustness to policy-driven structural HbA<sub>1c</sub> missingness through a prespecified ON/OFF sensitivity analysis.</p></sec><sec sec-type="methods"><title>Methods</title><p>Annual health examination datasets from 2021 (n=3744), 2022 (n=5153), and 2023 (n=5352) were analyzed using an identical preprocessing and modeling pipeline. Thirteen clinical indicators and eight lifestyle questionnaire variables were included as predictors. The outcome was based on an A&#x2013;D grading framework and binarized using an OR rule across domains (grade &#x2265;B in any domain). Continuous variables were median-imputed and standardized within each training fold to prevent information leakage. GA optimization was performed using fixed random seeds, and fitness estimation employed stratified K-fold cross-validation. Predicted probabilities were obtained by fitting logistic regression models to GA-derived composite scores within the OOF framework. Discrimination and overall predictive performance were quantified using the area under the receiver operating characteristic curve (AUC) and the Brier score calculated from OOF predicted probabilities. Uncertainty was estimated using 2000-replicate percentile bootstrap resampling. 
A prespecified sensitivity analysis excluded HbA<sub>1c</sub> while maintaining an identical evaluation framework.</p></sec><sec sec-type="results"><title>Results</title><p>OOF AUC values were stable across cohorts (2021: 0.810; 2022: 0.814; 2023: 0.812), with overlapping 95% percentile bootstrap confidence intervals. Brier scores ranged from 0.172 to 0.176. Exclusion of HbA<sub>1c</sub> resulted in small changes in discrimination (median &#x0394;AUC was &#x2264;0.007), consistent with the prespecified ON/OFF sensitivity analysis.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Under a harmonized OOF validation framework, the GA-derived composite risk score showed stable temporal discrimination and consistent overall predictive performance across three consecutive annual cohorts. These findings underscore the methodological importance of prespecified, standardized evaluation procedures and transparent uncertainty quantification when assessing reproducibility of risk stratification models in routine health screening data.</p></sec></abstract><kwd-group><kwd>genetic algorithm</kwd><kwd>health risk scoring</kwd><kwd>reproducibility</kwd><kwd>cross-validation</kwd><kwd>ROC</kwd><kwd>AUC</kwd><kwd>preventive medicine</kwd><kwd>area under the receiver operating characteristic curve</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>In preventive health screening, risk classification commonly relies on threshold-based evaluation of individual clinical indicators (eg, blood pressure, lipids, or HbA<sub>1c</sub>). While such approaches ensure procedural uniformity, they do not integrate multiple biological and lifestyle dimensions into a composite risk representation. 
Data-driven optimization approaches have therefore been proposed to enhance structural consistency and interpretability of health risk scoring.</p><p>Genetic algorithms (GAs), originally introduced by Holland [<xref ref-type="bibr" rid="ref1">1</xref>] and further formalized by Goldberg [<xref ref-type="bibr" rid="ref2">2</xref>], provide a flexible framework for feature weighting and optimization under cross-validated conditions. However, GA-based scoring models applied to real-world health checkup data require careful attention to reproducibility, methodological harmonization, and internal validation procedures. In particular, performance estimates may vary depending on whether preprocessing, optimization, and evaluation steps are strictly separated within a prespecified validation framework.</p><p>Bayesian estimation can be incorporated as an interpretive layer to express predicted risk probabilistically, aligning composite scores with calibrated predicted probabilities. Rather than emphasizing peak discrimination, evaluating temporal reproducibility under a standardized analytical framework is essential for ensuring methodological consistency.</p><p>The objective of this study was to evaluate the temporal reproducibility of a GA-derived composite health risk score across three consecutive annual cohorts (2021&#x2010;2023) using a prespecified, standardized out-of-fold validation pipeline.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Data Source and Participants</title><p>We analyzed a deidentified dataset from annual health checkups conducted at the Preventive Medicine Center (Ningen Dock Division), Yoshida Hospital, Keiyukai Medical Corporation (Asahikawa, Hokkaido, Japan). 
The analytic cohorts comprised examinees from 2021 (n=3744), 2022 (n=5153), and 2023 (n=5352), each analyzed as an independent annual cohort.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>The study was approved by the Institutional Review Board of Yoshida Hospital, Keiyukai Medical Corporation (Approval No. 20251002001) and conducted in accordance with the Declaration of Helsinki. Data were deidentified prior to analysis. Written informed consent, including consent for secondary use of de-identified data, was obtained at the time of the health checkup. No additional interventions or participant contact occurred as part of this study.</p></sec><sec id="s2-3"><title>Measures and Preprocessing</title><p>Thirteen routine clinical indicators were included: BMI, waist circumference, systolic blood pressure, diastolic blood pressure, fasting plasma glucose, hemoglobin A1c (HbA<sub>1c</sub>), triglycerides, high-density lipoprotein cholesterol, low-density lipoprotein cholesterol, aspartate aminotransferase, alanine aminotransferase, &#x03B3;-glutamyl transferase, and uric acid. Sex-specific thresholds were applied for waist circumference according to institutional criteria (see Table S2B in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Eight lifestyle questionnaire items were included (eg, smoking, alcohol consumption, breakfast habits, snacking, eating speed, mastication, physical activity/walking, and motivation for health improvement). Lifestyle questionnaire items were coded as binary variables according to the facility codebook. Variable definitions are provided in Table S2A in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Continuous variables were median-imputed and standardized within each training fold of each annual cohort. 
The imputation and standardization parameters were estimated within the training fold and applied to the corresponding held-out fold to prevent information leakage.</p><p>Indicator-level missingness rates are summarized in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Missingness in indicators other than HbA<sub>1c</sub> was low (&#x003C;2% in each year).</p></sec><sec id="s2-4"><title>Outcome Definition</title><p>The primary outcome was defined as a composite abnormality label derived from routine health-check classification rules used in the screening program. For each clinical domain (eg, glucose metabolism, blood pressure, lipids, liver enzymes, and anthropometric indices), examinees were categorized according to prespecified threshold-based grades (A&#x2013;D).</p><p>In the institutional screening system, grade A indicates no abnormality; grade B indicates mild abnormality typically requiring lifestyle guidance; grade C indicates follow-up or re-evaluation; and grade D indicates recommendation for further diagnostic evaluation or treatment.</p><p>The composite outcome was binarized using an OR rule: participants were labeled outcome-positive if any domain met or exceeded the predefined abnormality threshold (grade B or higher); otherwise, they were labeled outcome-negative. This definition was applied consistently across all annual cohorts to evaluate structural reproducibility of the operational framework rather than severity-specific prognostic discrimination. 
Detailed domain-specific thresholds corresponding to grade B or higher are provided in Table S2B in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>The grading thresholds (A&#x2013;D) were defined according to the standardized health-check classification framework established by the Japanese Society of Ningen Dock and Preventive Medicine, which is based on national health screening standards and specialty society guidelines. These classifications are widely used in routine health-check programs across Japan to guide follow-up recommendations (eg, observation, repeat testing, referral, or treatment). This study adopted these externally defined operational criteria without modification and dichotomized grade B or higher to capture any clinically relevant abnormality that warrants structured follow-up under this program. Participants categorized as grade E (under active treatment) were excluded from the outcome classification process.</p></sec><sec id="s2-5"><title>Handling of HbA<sub>1c</sub> Structural Missingness</title><p>HbA<sub>1c</sub> was structurally missing for a subset of participants due to the program&#x2019;s screening policy-based test selection. Indicator-level missingness rates are summarized in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>Because the composite outcome was defined using an OR rule across multiple domains, outcome ascertainment did not depend solely on HbA<sub>1c</sub>.</p><p>To evaluate robustness to structural HbA<sub>1c</sub> missingness, we conducted a prespecified sensitivity analysis excluding HbA<sub>1c</sub> from the predictor set while maintaining an identical evaluation pipeline (HbA<sub>1c</sub> included vs excluded).</p></sec><sec id="s2-6"><title>Model Development and Evaluation</title><p>A composite score was generated from standardized features using a genetic algorithm (GA). 
All stochastic components were controlled by fixing the random seed (SEED=42) for both Python&#x2019;s random module and NumPy. Fitness estimation used stratified K-fold cross-validation with shuffling (random_state=42).</p><p>The GA-derived composite score was subsequently entered into a logistic regression model to generate calibrated predicted probabilities within a prespecified out-of-fold (OOF) validation framework (Platt scaling [<xref ref-type="bibr" rid="ref3">3</xref>]).</p><p>Bayesian updating was applied post hoc for interpretability purposes to the calibrated predicted probabilities and did not influence GA optimization or probability estimation.</p><p>For each fold, the model was trained on the training subset and evaluated on the corresponding held-out subset. OOF predictions were aggregated across folds to obtain a single internally validated prediction for each participant within each annual cohort.</p></sec><sec id="s2-7"><title>Discrimination and Calibration</title><p>Model discrimination was assessed using the area under the receiver operating characteristic curve (AUC) calculated from OOF predicted probabilities generated within the prespecified cross-validated pipeline.</p><p>Overall predictive performance was quantified using the OOF-based Brier score. Calibration was examined descriptively using calibration plots based on OOF predicted probabilities (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). No additional recalibration or threshold optimization was performed beyond the prespecified validation framework.</p></sec><sec id="s2-8"><title>Bootstrap Uncertainty Estimation</title><p>To improve statistical transparency, 95% percentile bootstrap confidence intervals for OOF AUC and Brier score were computed using 2000 participant-level resamples within each annual cohort and HbA<sub>1c</sub> condition (ON/OFF). 
Performance metrics were recalculated from the previously generated OOF predicted probabilities without refitting the model, thereby preserving internal validation and avoiding information leakage while maintaining the integrity of the original cross-validated predictions.</p><p>Year-stratified OOF performance estimates and confidence intervals are reported in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and prespecified ON&#x2013;OFF differences are summarized in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-9"><title>Statistical Software</title><p>All analyses were performed in Python (version 3.13.5) using <italic>scikit-learn</italic> (version 1.6.1) [<xref ref-type="bibr" rid="ref4">4</xref>] and DEAP (version 1.4) [<xref ref-type="bibr" rid="ref5">5</xref>]. Analyses were conducted in a Jupyter-based environment (Anaconda distribution). Detailed GA configuration parameters, including population size, number of generations, weight initialization range, crossover and mutation settings, and random-seed control, are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> (&#x201C;Genetic Algorithm Implementation&#x201D;) to facilitate reproducibility.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Discrimination and Overall Predictive Performance Under Harmonized Out-of-Fold (OOF) Validation</title><p>Across the three annual cohorts, sample sizes ranged from 3744 to 5352 individuals, with outcome prevalence between 36.7% and 37.9%, indicating comparable class balance across years.</p><p>Under the standardized OOF validation framework, discrimination remained stable across cohorts. OOF AUC values were 0.810 (2021), 0.814 (2022), and 0.812 (2023), with overlapping 95% bootstrap confidence intervals (<xref ref-type="table" rid="table1">Table 1</xref>). 
Brier scores ranged from 0.172 to 0.176 across cohorts, indicating stable overall predictive performance across years (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Discrimination and overall predictive performance under harmonized out-of-fold (OOF) validation (2021&#x2010;2023; primary model with HbA1c included). AUC and Brier score were calculated exclusively from OOF predicted probabilities generated within the prespecified cross-validation pipeline. Values in parentheses represent 95% percentile bootstrap confidence intervals based on 2000 resamples. Calibration plots based on OOF predicted probabilities are provided in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The prespecified HbA<sub>1c</sub> ON/OFF sensitivity analysis and ON&#x2212;OFF differences are summarized in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Year</td><td align="left" valign="bottom">Individuals, n</td><td align="left" valign="bottom">Outcome prevalence</td><td align="left" valign="bottom">OOF<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> AUC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> (95% CI)</td><td align="left" valign="bottom">Brier score (95% CI)</td></tr></thead><tbody><tr><td align="char" char="." valign="top">2021</td><td align="char" char="." valign="top">3744</td><td align="char" char="." valign="top">0.375</td><td align="char" char="." valign="top">0.810 (0.794&#x2010;0.820)</td><td align="char" char="." valign="top">0.176 (0.170&#x2010;0.183)</td></tr><tr><td align="char" char="." valign="top">2022</td><td align="char" char="." valign="top">5153</td><td align="char" char="." valign="top">0.379</td><td align="char" char="." 
valign="top">0.814 (0.802&#x2010;0.825)</td><td align="char" char="." valign="top">0.173 (0.168&#x2010;0.178)</td></tr><tr><td align="char" char="." valign="top">2023</td><td align="char" char="." valign="top">5352</td><td align="char" char="." valign="top">0.367</td><td align="char" char="." valign="top">0.812 (0.800&#x2010;0.824)</td><td align="char" char="." valign="top">0.172 (0.166&#x2010;0.177)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>OOF: out-of-fold.</p></fn><fn id="table1fn2"><p><sup>b</sup>AUC: area under the receiver operating characteristic curve. </p></fn></table-wrap-foot></table-wrap><p>Corresponding calibration plots are provided in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>In the prespecified HbA<sub>1c</sub> ON/OFF sensitivity analysis, exclusion of HbA1c resulted in minimal changes in discrimination and Brier score (Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), suggesting limited sensitivity of model performance to policy-driven structural HbA1c missingness under the harmonized OOF validation framework.</p><p>As the primary objective was to assess temporal reproducibility under standardized analytical procedures, formal hypothesis testing of between-year differences was not performed.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study evaluated the temporal reproducibility of a genetic algorithm (GA)&#x2013;derived composite health risk score across three consecutive annual health checkup cohorts under a prespecified OOF validation framework. Cross-validated OOF AUC values ranged from 0.810 to 0.814, with overlapping bootstrap confidence intervals, indicating stable discrimination under standardized analytical procedures. 
Earlier exploratory analyses yielded higher apparent AUC values; however, these were not derived exclusively from OOF predictions under the same pipeline and are therefore not presented as primary performance estimates. The primary contribution of this study is methodological: it demonstrates that a GA-derived score can achieve stable OOF discrimination across consecutive cohorts when preprocessing, optimization, and evaluation are uniformly applied, rather than emphasizing peak performance under heterogeneous analytical conditions.</p><p>Because performance estimates were based exclusively on OOF predicted probabilities, the evaluation preserved internal validation and minimized information leakage. Variability observed in earlier exploratory analyses likely reflected differences in analytical procedures rather than underlying cohort characteristics, highlighting the importance of consistent preprocessing, optimization, and evaluation when assessing artificial intelligence&#x2013;based risk stratification models.</p><p>Importantly, the outcome definition reflects operational screening classification rather than confirmed clinical diagnoses. The composite label was constructed using threshold-based OR combinations across correlated clinical domains, and the dominant abnormality domain contributing to classification may vary across cohorts depending on distributional shifts. In this context, AUC values represent the model&#x2019;s ability to consistently reconstruct the structured screening framework under harmonized analytical conditions rather than discrimination of severity-specific disease states.</p><p>Methodologically, GA optimization produced a composite score from standardized predictors, which was subsequently mapped to calibrated predicted probabilities using logistic regression. Bayesian updating was applied post hoc as an interpretability add-on to the calibrated predicted probabilities and did not influence GA optimization or probability estimation. 
The explicit specification of evolutionary hyperparameters, weight initialization range, and random-seed control further strengthens the reproducibility of the optimization procedure and reduces the likelihood that the observed discrimination reflects stochastic artifacts.</p><p>The prespecified HbA1c ON/OFF sensitivity analysis showed minimal changes in AUC and Brier score, suggesting limited sensitivity to policy-driven structural HbA1c missingness under the standardized evaluation framework; however, re-optimizing GA weights under an altered feature set represents a distinct modeling exercise.</p><p>This study has limitations. It was conducted at a single center in Japan using an occupational health checkup population. The outcome was cross-sectional, and prospective or external validation was not performed. In addition, because grade B or higher includes heterogeneous categories with varying clinical implications (ranging from lifestyle guidance to referral for diagnostic evaluation), model discrimination reflects detection of operational abnormality rather than exclusively high-severity disease states. Furthermore, because the outcome was defined using threshold-based criteria that partially overlap with included predictors, discrimination should be interpreted as reconstruction of the operational classification within the same workflow rather than independent prognostic discrimination. This endpoint is an operational classification designed for workflow consistency, not a clinically adjudicated diagnosis. 
Accordingly, the present OOF-based results provide an internally validated assessment of temporal reproducibility within this screening system.</p></sec><sec id="s4-2"><title>Conclusions</title><p>In conclusion, under a prespecified, harmonized OOF validation framework within a single institutional screening system, the GA-derived composite risk score demonstrated stable temporal discrimination and consistent overall predictive performance across three consecutive annual cohorts. These findings support methodological reproducibility and structural consistency under standardized analytical procedures. However, the present results do not establish external generalizability or clinical effectiveness, and independent external and prospective validation is required before broader clinical implementation can be inferred.</p></sec></sec></body><back><ack><p>The authors thank the staff of the Preventive Medicine Center (Ningen Dock Division), Yoshida Hospital-Keiyukai Medical Corporation, including Junko Suzuki, Masami Takahashi, Eri Kagaya, Miki Sato, Mikiko Shibuya, Toshiharu Hazeyama, and Kouichi Kagi, for their assistance with data collection and management. The authors also thank the participants of the health checkup programs for their cooperation. The institution was certified as a government-recognized clinical research center (MEXT-approved, ID: 90106, August 2024).</p></ack><notes><sec><title>Funding</title><p>This research received no external funding and was conducted as part of the institutional research activities of Keiyukai Medical Corporation.</p></sec><sec><title>Data Availability</title><p>The datasets generated and analyzed during the current study are not publicly available due to institutional policy and ethical restrictions. 
Deidentified data may be made available from the corresponding author upon reasonable request and with approval by the institutional review board.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: YA</p><p>Data curation: RY</p><p>Formal analysis: YA</p><p>Methodology: YA</p><p>Validation: HT, KY</p><p>Writing &#x2013; original draft: YA</p><p>Writing &#x2013; review &#x0026; editing: YA, HT, KY, RY</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb2">GA</term><def><p>genetic algorithm</p></def></def-item><def-item><term id="abb3">HbA<sub>1c</sub></term><def><p>hemoglobin A1c</p></def></def-item><def-item><term id="abb4">OOF</term><def><p>out-of-fold</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Holland</surname><given-names>JH</given-names> </name></person-group><source>Adaptation in Natural and Artificial Systems</source><year>1975</year><publisher-name>University of Michigan Press</publisher-name><pub-id pub-id-type="other">9780262581110</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Goldberg</surname><given-names>DE</given-names> </name></person-group><source>Genetic Algorithms in Search, Optimization, and Machine Learning</source><year>1989</year><publisher-name>Addison-Wesley</publisher-name><pub-id pub-id-type="other">9780201157673</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Platt</surname><given-names>J</given-names> 
</name></person-group><article-title>Probabilities for SV machines</article-title><source>Advances in Large Margin Classifiers</source><year>1999</year><publisher-name>MIT Press</publisher-name><fpage>61</fpage><lpage>74</lpage><pub-id pub-id-type="doi">10.7551/mitpress/1113.003.0008</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pedregosa</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Scikit-learn: machine learning in Python</article-title><source>J Mach Learn Res</source><year>2011</year><access-date>2026-04-10</access-date><volume>12</volume><fpage>2825</fpage><lpage>2830</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://jmlr.org/papers/v12/pedregosa11a.html">https://jmlr.org/papers/v12/pedregosa11a.html</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fortin</surname><given-names>FA</given-names> </name></person-group><article-title>DEAP: evolutionary algorithms made easy</article-title><source>J Mach Learn Res</source><year>2012</year><access-date>2026-04-10</access-date><volume>13</volume><fpage>2171</fpage><lpage>2175</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/v13/fortin12a.html">https://www.jmlr.org/papers/v13/fortin12a.html</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary materials including Tables S1&#x2013;S5, additional methodological details (GA implementation and Bayesian risk update), calibration metrics, and prespecified HbA<sub>1c</sub> ON/OFF sensitivity analyses.</p><media xlink:href="bioinform_v7i1e85659_app1.docx" xlink:title="DOCX File, 429 
KB"/></supplementary-material></app-group></back></article>