<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Bioinform Biotech</journal-id><journal-id journal-id-type="publisher-id">bioinform</journal-id><journal-id journal-id-type="index">19</journal-id><journal-title>JMIR Bioinformatics and Biotechnology</journal-title><abbrev-journal-title>JMIR Bioinform Biotech</abbrev-journal-title><issn pub-type="epub">2563-3570</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v6i1e70621</article-id><article-id pub-id-type="doi">10.2196/70621</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Optimizing Feature Selection and Machine Learning Algorithms for Early Detection of Prediabetes Risk: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Almadhoun</surname><given-names>Mahmoud B</given-names></name><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Burhanuddin</surname><given-names>MA</given-names></name><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Fakulti Kecerdasan Buatan dan Keselamatan Siber, Universiti Teknikal Malaysia</institution><addr-line>Melaka</addr-line><addr-line>Durian Tunggal</addr-line><country>Malaysia</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Uzun</surname><given-names>Alper</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Placencia</surname><given-names>Greg</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Awadallah</surname><given-names>Mohammed A</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Atallah</surname><given-names>Rasha</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gonzalez</surname><given-names>Luis I. Lopera</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to MA Burhanuddin, Fakulti Kecerdasan Buatan dan Keselamatan Siber, Universiti Teknikal Malaysia, Melaka, Durian Tunggal, 75450, Malaysia, 60 194807552; <email>burhanuddin@utem.edu.my</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>31</day><month>7</month><year>2025</year></pub-date><volume>6</volume><elocation-id>e70621</elocation-id><history><date date-type="received"><day>28</day><month>12</month><year>2024</year></date><date date-type="rev-recd"><day>03</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>20</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Mahmoud B Almadhoun, MA Burhanuddin. Originally published in JMIR Bioinformatics and Biotechnology (<ext-link ext-link-type="uri" xlink:href="https://bioinform.jmir.org">https://bioinform.jmir.org</ext-link>), 31.7.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/">http://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Bioinformatics and Biotechnology, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://bioinform.jmir.org/">https://bioinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://bioinform.jmir.org/2025/1/e70621"/><abstract><sec><title>Background</title><p>Prediabetes is an intermediate stage between normal glucose metabolism and diabetes and is associated with increased risk of complications like cardiovascular disease and kidney failure.</p></sec><sec><title>Objective</title><p>It is crucial to recognize individuals with prediabetes early in order to apply timely intervention strategies to decelerate or prohibit diabetes development. This study aims to compare the effectiveness of machine learning (ML) algorithms in predicting prediabetes and identifying its key clinical predictors.</p></sec><sec sec-type="methods"><title>Methods</title><p>Multiple ML models are evaluated in this study, including random forest, extreme gradient boosting (XGBoost), support vector machine (SVM), and <italic>k</italic>-nearest neighbors (KNNs), on a dataset of 4743 individuals. 
For improved performance and interpretability, key clinical features were selected using LASSO (Least Absolute Shrinkage and Selection Operator) regression and principal component analysis (PCA). To optimize model accuracy and reduce overfitting, we used hyperparameter tuning with RandomizedSearchCV for XGBoost and random forest, and GridSearchCV for SVM and KNN. SHAP (Shapley Additive Explanations) was used to assess model-agnostic feature importance. To resolve data imbalance, SMOTE (Synthetic Minority Oversampling Technique) was applied to ensure reliable classifications.</p></sec><sec sec-type="results"><title>Results</title><p>A cross-validated ROC-AUC (receiver operating characteristic area under the curve) score of 0.9117 highlighted the robustness of random forest in generalizing across datasets among the models tested. XGBoost followed closely, providing balanced accuracy in distinguishing between normal and prediabetic cases. While SVMs and KNNs performed adequately as baseline models, they exhibited limitations in sensitivity. The SHAP analysis indicated that BMI, age, high-density lipoprotein cholesterol, and low-density lipoprotein cholesterol emerged as the key predictors across models. The performance was significantly enhanced through hyperparameter tuning; for example, the ROC-AUC for SVM increased from 0.813 (default) to 0.863 (tuned). PCA kept 12 components while maintaining 95% of the variance in the dataset.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>It is demonstrated in this research that optimized ML models, especially random forest and XGBoost, are effective tools for assessing early prediabetes risk. Combining SHAP analysis with LASSO and PCA enhances transparency, supporting their integration in real-time clinical decision support systems. 
Future directions include validating these models in diverse clinical settings and integrating additional biomarkers to improve prediction accuracy, offering a promising avenue for early intervention and personalized treatment strategies in preventive health care.</p></sec></abstract><kwd-group><kwd>prediabetes</kwd><kwd>machine learning</kwd><kwd>feature selection</kwd><kwd>prediction</kwd><kwd>extreme gradient boosting</kwd><kwd>support vector machine</kwd><kwd>k-nearest neighbors</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>A prediabetic state is characterized by elevated blood sugar levels, considered as an intermediate stage between normal glucose metabolism and type 2 diabetes [<xref ref-type="bibr" rid="ref1">1</xref>]. In individuals with a high risk of diabetes, cardiovascular disease, and kidney complications, early diagnosis and intervention in prediabetes is important for delaying or preventing progression to diabetes [<xref ref-type="bibr" rid="ref2">2</xref>]. In spite of lifestyle interventions, adherence remains one of the biggest challenges, which necessitates early and accurate detection.</p><p>While biochemical markers like fasting glucose and glycated hemoglobin are valuable, they may not capture the full spectrum of prediabetes risk factors, resulting in missed diagnoses and delayed interventions. To address this, a wide set of predictors, including clinical and genetic data, needs to be incorporated. This issue can be overcome by machine learning (ML), which can analyze complex relationships between a broad range of biomarkers [<xref ref-type="bibr" rid="ref3">3</xref>]. 
By leveraging ML algorithms, this study aims to enhance the accuracy of prediabetes risk assessment and early detection.</p><p>A feature selection technique such as LASSO (Least Absolute Shrinkage and Selection Operator) regression and principal component analysis (PCA) further optimizes these models by focusing on the most apropos predictors, as a consequence improving both efficiency and interpretability [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Additionally, it reduces model complexity and boosts prediction accuracy by eliminating irrelevant or unnecessary data in ML. Models based on the most impactful clinical features, such as BMI, age, low-density lipoprotein cholesterol (LDL-C), and high-density lipoprotein cholesterol (HDL-C), can capture underlying patterns linked with prediabetes [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>This paper assesses and compares the predictive power of various ML algorithms such as random forest, extreme gradient boosting (XGBoost), support vector machine (SVM), and <italic>k</italic>-nearest neighbors (KNNs), inclusive of feature selection methods such as LASSO and PCA. We aim to identify the most effective model and feature selection technique for the detection of early prediabetes, ultimately contributing to highly accurate diagnostics and personalized prevention.</p><p>In this study, key predictors such as BMI, age, LDL-C, and HDL-C were identified, which may refine diagnostic criteria and help with targeted prevention. The findings emphasize the capability for ML-based tools to improve prediabetes management and foster better patient outcomes through early intervention.</p><p>Various ML models have been used in recent studies to enhance detection accuracy and identify key risk factors associated with prediabetes progression. 
These approaches underscore the potential of ML in developing effective and clinically applicable prediction models for prediabetes risk.</p><p>An important direction is using ensemble and decision tree&#x2013;based models to predict prediabetes. A study by Liu et al [<xref ref-type="bibr" rid="ref7">7</xref>] evaluated logistic regression, decision trees, random forests, and XGBoost to predict diabetes progression in older patients with prediabetes. XGBoost was the most accurate model (60.66%), but its generalizability was limited by the dataset&#x2019;s narrow demographic scope. In spite of a minor decline in predictive performance over time, XGBoost showed promise as a model for identifying prediabetes risk factors among older adults. Similarly, Abbas et al [<xref ref-type="bibr" rid="ref8">8</xref>] developed a model of prediabetes risk score for a Middle Eastern cohort based on random forest, gradient boosting, XGBoost, and deep learning. This model effectively screens risk across different groups of individuals by analyzing demographic and physiological factors, including age, blood pressure, BMI, waist size, and gender. Primary care settings could benefit from the study&#x2019;s focus on noninvasive, easily measurable variables.</p><p>Additionally, tree-based models, logistic regression, and LASSO have been commonly used to refine prediabetes risk prediction. Hu et al [<xref ref-type="bibr" rid="ref9">9</xref>] developed a personalized nomogram that predicted 5-year prediabetes risk among Chinese adults. Using stepwise selection, LASSO, and ML models, Hu et al [<xref ref-type="bibr" rid="ref9">9</xref>] found that the LASSO model provided the best performance with variables such as age, BMI, fasting blood glucose, and serum creatinine. As a result of this approach, LASSO can generate an accurate yet efficient model even with a limited number of predictive features. 
In another logistic regression&#x2013;based study, Yu et al [<xref ref-type="bibr" rid="ref10">10</xref>] validated a prediabetes assessment model on a large Chinese dataset. Based on C statistics and calibration plots, the model demonstrated good discrimination, but a cohort study might improve its performance.</p><p>Efforts have also been made to incorporate nonlaboratory risk factors into predictive models. In a study by Dong et al [<xref ref-type="bibr" rid="ref11">11</xref>], lifestyle factors such as sleep duration and recreational activity were incorporated into a model using logistic regression and interpretable ML techniques, especially XGBoost. SHAP (Shapley Additive Explanations) was used to determine variable significance, revealing that lifestyle variables are crucial to the model&#x2019;s detection efficiency. By incorporating clinical and lifestyle predictors, XGBoost can identify undiagnosed prediabetes and diabetes, offering a more comprehensive risk assessment.</p><p>As a result of these studies, we can observe that ensemble methods (random forest and XGBoost), regression-based approaches (logistic regression and LASSO), and interpretable ML models (eg, SHAP-enhanced XGBoost) all offer unique strengths in predicting prediabetes risk. According to the results, while tree-based models and ensemble models tend to be more accurate, regression techniques such as LASSO help create interpretable, efficient models, especially when resources are limited.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Dataset</title><p>This study used a dataset that is publicly accessible, which includes health records from 4743 individuals who were examined at the Health Management Center of Peking University Shenzhen Hospital from January 2020 to March 2023. 
The World Health Organization standards were followed when assessing fasting blood glucose levels, random blood glucose levels, oral glucose tolerance tests, and glycated hemoglobins of participants. Prediabetes was diagnosed if fasting blood glucose was between 6.1 and 6.9 mmol/L or if the blood glucose level was between 7.8 and 11.0 mmol/L after oral glucose tolerance test. Based on glucose metabolism status, participants were classified into 2 groups: normal (1593/4743, 33.6%) and prediabetes (3150/4743, 66.4%). The dataset included 22 features, comprising demographic, clinical, and laboratory variables such as age, BMI, HDL-C, and fasting blood glucose levels. The target variable for the study was binary, with participants categorized as either normal or prediabetic. Since this dataset is open to the public and anonymized, numeric values for individual IDs were preserved for traceability in the preprocessing phase, but they do not contain any personally identifiable information.</p></sec><sec id="s2-2"><title>Variable Assignment and Data Categorization</title><p>In this study, the dataset includes both categorical and numerical variables. The categorical variables, such as status, gender, urine glucose, and urine protein, were assigned specific values to facilitate analysis. These values allow for easy differentiation between groups or conditions. On the other hand, continuous or numerical variables, such as age, BMI, and various blood and urine biomarkers, were used as-is without specific value assignments since they naturally provide a range of measurements. 
<xref ref-type="table" rid="table1">Table 1</xref> shows the assigned values for each of the categorical variables.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Dataset variables and descriptions for prediabetes risk assessment.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Variable name</td><td align="left" valign="bottom">Meaning of variable</td><td align="left" valign="bottom">Type of variable</td><td align="left" valign="bottom">Assignment description</td></tr></thead><tbody><tr><td align="left" valign="top">Status</td><td align="left" valign="top">Glucose metabolic status</td><td align="left" valign="top">Categorical variable</td><td align="left" valign="top">1=normal, 2=prediabetes</td></tr><tr><td align="left" valign="top">Age</td><td align="left" valign="top">Age</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">Gender</td><td align="left" valign="top">Gender</td><td align="left" valign="top">Categorical variable</td><td align="left" valign="top">0=female, 1=male</td></tr><tr><td align="left" valign="top">BMI</td><td align="left" valign="top">Body mass index</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">SBP</td><td align="left" valign="top">Systolic blood pressure</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">U-GLU</td><td align="left" valign="top">Urine glucose</td><td align="left" valign="top">Categorical variable</td><td align="left" valign="top">0=negative, 1=positive</td></tr><tr><td align="left" valign="top">PRO</td><td align="left" valign="top">Urine protein</td><td align="left" valign="top">Categorical variable</td><td align="left" valign="top">0=negative, 
1=positive</td></tr><tr><td align="left" valign="top">TP</td><td align="left" valign="top">Total protein</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">ALB</td><td align="left" valign="top">Albumin</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">GLB</td><td align="left" valign="top">Globulin</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">T-BIL</td><td align="left" valign="top">Total bilirubin</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">DB</td><td align="left" valign="top">Direct bilirubin</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">IB</td><td align="left" valign="top">Indirect bilirubin</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">ALT</td><td align="left" valign="top">Alanine aminotransferase</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">AST</td><td align="left" valign="top">Aspartate transaminase</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">BUN</td><td align="left" valign="top">Blood urea nitrogen</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">SCr</td><td align="left" valign="top">Serum creatinine</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is 
unassigned</td></tr><tr><td align="left" valign="top">UA</td><td align="left" valign="top">Uric acid</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">TC</td><td align="left" valign="top">Total cholesterol</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">TG</td><td align="left" valign="top">Triglycerides</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">HDL-C</td><td align="left" valign="top">High-density lipoprotein cholesterol</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr><tr><td align="left" valign="top">LDL-C</td><td align="left" valign="top">Low-density lipoprotein cholesterol</td><td align="left" valign="top">Numerical variable</td><td align="left" valign="top">Is unassigned</td></tr></tbody></table></table-wrap></sec><sec id="s2-3"><title>Data Preprocessing</title><sec id="s2-3-1"><title>Overview</title><p>For improved model performance, data preprocessing involved handling missing values through mean imputation, balancing the dataset using SMOTE (Synthetic Minority Oversampling Technique), and scaling features with StandardScaler() and MinMaxScaler(). Through these steps, the dataset was optimized for building reliable ML models for prediabetes risk prediction.</p></sec><sec id="s2-3-2"><title>Handling Missing Data</title><p>Missing values were imputed using the mean of the corresponding feature, guaranteeing consistency and completeness in the dataset.</p></sec><sec id="s2-3-3"><title>Balancing the Dataset</title><p>The dataset has an imbalanced class distribution, with 33.6% (1593/4743) representing the normal group (status=1) and 66.4% (3150/4743) representing the prediabetes group (status=2). 
This type of imbalance can influence the performance of classification models, specifically incorrectly predicting the minority class (normal group in this case), so SMOTE was used to oversample the minority class (normal group). This step ensured that the ML models were not biased toward the larger class, improving predictive performance [<xref ref-type="bibr" rid="ref12">12</xref>], particularly for prediabetes detection.</p></sec><sec id="s2-3-4"><title>Scaling and Normalization</title><p>Scaling and normalization are pivotal steps when preparing continuous variables for models such as KNN, SVM, and LASSO, which are sensitive to feature scaling. To address this, the features are standardized using the &#x201C;StandardScaler(),&#x201D; which tunes them to have a mean of 0 and an SD of 1. This standardization guarantees that all features are on a similar scale and refines model performance. In addition, normalization can be applied using the &#x201C;MinMaxScaler(),&#x201D; which transforms the data into a range between 0 and 1 [<xref ref-type="bibr" rid="ref13">13</xref>].</p></sec></sec><sec id="s2-4"><title>Exploratory Data Analysis</title><p>To obtain an understanding of the relationship across several features and to pick out any patterns, trends, or correlations that may guide next steps, the dataset was completely explored before applying predictive models. Heatmaps were used to visualize the relationship between numerical variables as shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. The main goal of this step is to gain a fruitful understanding of the raw data and arrange it for additional analysis [<xref ref-type="bibr" rid="ref14">14</xref>]. Among the assessed models, SHAP analysis was performed solely on the XGBoost classifier due to its alignment with the TreeExplainer framework. Models based on trees benefit from SHAP&#x2019;s precise additive feature attributions, which are computationally efficient and theoretically robust. 
XGBoost&#x2019;s built-in support for SHAP made it more interpretable than other models (eg, SVM, KNN, and random forest).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Heatmap distribution of the dataset features. ALB: albumin; ALT: alanine aminotransferase; AST: aspartate transaminase; BUN: blood urea nitrogen; DB: direct bilirubin; GLB: globulin; HDL-C: high-density lipoprotein cholesterol; IB: indirect bilirubin; LDL-C: low-density lipoprotein cholesterol; SBP: systolic blood pressure; SCr: serum creatinine; T-BIL: total bilirubin; TC: total cholesterol; TG: triglyceride; TP: total protein; UA: uric acid.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e70621_fig01.png"/></fig></sec><sec id="s2-5"><title>Feature Selection</title><sec id="s2-5-1"><title>Overview</title><p>Two principal feature selection techniques were applied after the data exploration phase to choose the most relevant and informative variables. A suitable feature selection not only enhances the performance and interpretability of a model but also reduces computational complexity and the risk of overfitting [<xref ref-type="bibr" rid="ref15">15</xref>].</p></sec><sec id="s2-5-2"><title>LASSO Regression</title><p>LASSO regression was used as the first method for feature selection. The LASSO method reduces the number of variables by shrinking the coefficients of less important features to zero, which effectively eliminates them from the model [<xref ref-type="bibr" rid="ref16">16</xref>]. 
It is mostly useful for handling multicollinearity as it automatically picks one feature from a set of highly correlated features such as LDL-C and total cholesterol based on the correlation heatmap.</p></sec><sec id="s2-5-3"><title>About PCA</title><p>The main aim of this technique is to reduce dimensionality in the dataset by transforming the base features into a smaller set of uncorrelated components while keeping most of the variance in the data [<xref ref-type="bibr" rid="ref17">17</xref>]. In models facing overfitting, such as SVM and XGBoost, PCA reduced multicollinearity and compressed the feature space while retaining 95% of the variance in the data. Additionally, PCA reduced the number of variables, simplifying the model and making it more computationally efficient [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>Before training the predictive models, these feature selection techniques were applied. Using only relevant predictors improved model performance and generalizability. By using a structured approach to data exploration and feature selection, we lay a strong foundation for building and evaluating ML models for prediabetes risk prediction in the next phase.</p></sec></sec><sec id="s2-6"><title>Model Development</title><sec id="s2-6-1"><title>Overview</title><p>In this study, different ML models were used to predict the onset of prediabetes. These algorithms were selected due to their ability to handle high-dimensional data, interpretability, and performance in classification tasks. In addition, each model was tuned and evaluated to optimize performance for prediabetes detection.</p></sec><sec id="s2-6-2"><title>XGBoost</title><p>XGBoost is a powerful gradient-boosting algorithm that constructs an ensemble of decision trees to improve classification accuracy. 
Each tree is sequentially trained to correct the errors of the previous trees, which makes it more powerful for tasks with complex relationships between features [<xref ref-type="bibr" rid="ref19">19</xref>]. XGBoost is known for its performance and speed in handling big datasets, which makes it appropriate for medical prediction tasks like prediabetes diagnosis. In addition, XGBoost applies regularized boosting techniques to overcome the difficulty of the model and correct overfitting; as a result, increasing model accuracy [<xref ref-type="bibr" rid="ref20">20</xref>].</p></sec><sec id="s2-6-3"><title>Random Forest</title><p>Random forest is an ensemble learning approach that constructs numerous decision trees during training. Every tree is built using a random subset of features and data samples, and the final prediction is made by averaging the predictions from all trees [<xref ref-type="bibr" rid="ref21">21</xref>]. Random forest minimizes the risk of overfitting by using a bagging approach and tends to perform well on classification problems such as prediabetes detection.</p></sec><sec id="s2-6-4"><title>About SVM</title><p>SVM is a supervised learning model that separates data points into distinct classes by finding an optimal hyperplane. For the complex relationships between predictors, such as BMI and age, a nonlinear kernel was applied. This method is suited for medical diagnosis since the decision boundary is not linearly separable in high-dimensional spaces [<xref ref-type="bibr" rid="ref22">22</xref>].</p></sec><sec id="s2-6-5"><title>About KNNs</title><p>KNN is an uncomplicated, nonparametric classifier that specifies the class label based on the majority vote of the KNNs in the feature space [<xref ref-type="bibr" rid="ref23">23</xref>]. In this study, KNN was used after scaling the features, and the optimal number of neighbors was set through hyperparameter tuning. 
Despite KNN being computationally intensive for big datasets, its clarity and interpretability make it a beneficial model for prediabetes classification.</p></sec></sec><sec id="s2-7"><title>Hyperparameter Tuning and Cross-Validation</title><sec id="s2-7-1"><title>Overview</title><p>Hyperparameter tuning was used for all models to recognize the optimal settings for each algorithm. To achieve that, we used GridSearchCV and RandomizedSearchCV, which systematically explore a range of hyperparameters and choose the set that maximizes model performance.</p></sec><sec id="s2-7-2"><title>GridSearchCV</title><p>All combinations of hyperparameters are assessed exhaustively through a particular parameter grid. It is a systematic approach to identifying the effective parameter set [<xref ref-type="bibr" rid="ref17">17</xref>]. With large datasets and complex models, it can be computationally expensive, so this study used GridSearchCV for models with a relatively small hyperparameter search space, which made it feasible to explore all combinations. The KNN algorithm was tuned by tuning the number of neighbors (<italic>k</italic>) and the distance metric.</p></sec><sec id="s2-7-3"><title>RandomizedSearchCV</title><p>A randomized search of the hyperparameter space selects hyperparameter settings from the specified ranges [<xref ref-type="bibr" rid="ref24">24</xref>]. It is more efficient than GridSearchCV when the search space is large because it explores a representative sample of possible combinations instead of testing them all. We used this technique for more complicated models such as random forest and XGBoost when the number of hyperparameters and possible values was too large for a wide search. 
RandomizedSearchCV assists with identifying optimal hyperparameters by setting a limit on the number of iterations (eg, 40).</p></sec></sec><sec id="s2-8"><title>Tuning Process for Each Model</title><sec id="s2-8-1"><title>XGBoost</title><p>The hyperparameters, such as the maximum tree depth, the learning rate, and the subsample ratio, were tuned using RandomizedSearchCV. This approach allowed for a more efficient search through a vast range of parameter values, making it fit for models with big parameter spaces. Random sampling allowed the tuning process to explore a diversity of hyperparameter combinations while preventing overfitting and maximizing classification accuracy.</p></sec><sec id="s2-8-2"><title>Random Forest</title><p>To optimize hyperparameters such as the number of trees, maximum tree depth, and minimum samples required for a split, RandomizedSearchCV was used. This approach is selected for random forest because of the large search space, as it can easily sample a subset of hyperparameters to explore near-optimal settings.</p></sec><sec id="s2-8-3"><title>About SVM</title><p>To fine-tune hyperparameters such as the kernel type and penalty parameter C, GridSearchCV was used. Due to the smaller search space for SVM, GridSearchCV is considered the best choice because this approach performs a wide search over the specified parameter values, so it guarantees to find the best possible combinations for the model.</p></sec><sec id="s2-8-4"><title>About KNNs</title><p>To tune the distance metrics (eg, Euclidean or Manhattan distance) and number of neighbors (<italic>k</italic>), the GridSearchCV method was applied. 
This approach is useful to pick out the most effective neighborhood size and similarity measures for predicting prediabetes.</p><p>This tuning strategy guaranteed that every model was fine-tuned to work optimally for prediabetes prediction.</p></sec></sec><sec id="s2-9"><title>Cross-Validation Approach</title><p>The tuning process for each model included <italic>k</italic>-fold cross-validation to ensure reliable performance estimation and reduce the risk of overfitting. In <italic>k</italic>-fold cross-validation:</p><list list-type="bullet"><list-item><p>The dataset is divided into <italic>k</italic> equal-sized subsets (folds).</p></list-item><list-item><p>The model is trained on <italic>k</italic> &#x2013; 1 folds and tested on the remaining fold. This process is repeated <italic>k</italic> times, with each fold serving as the test set once. The results are averaged to get a final evaluation metric.</p></list-item><list-item><p>5-fold cross-validation was used in this study, which balances computational cost and model evaluation reliability.</p></list-item></list><p>Through cross-validation, a robust estimate of model performance across various subsets of data is obtained by evaluating how well the model generalizes to unseen data [<xref ref-type="bibr" rid="ref25">25</xref>]. To choose the best-performing parameter set, this method was used during hyperparameter tuning.</p></sec><sec id="s2-10"><title>Model Evaluation Metrics</title><sec id="s2-10-1"><title>Overview</title><p>To evaluate the performance of ML models, various metrics were applied.</p></sec><sec id="s2-10-2"><title>Accuracy</title><p>This is the measure of the percentage of true predictions made by the model out of all predictions. 
Nevertheless, accuracy alone can be misleading, particularly when the classes are imbalanced, as in the case of prediabetes diagnosis.</p></sec><sec id="s2-10-3"><title>Precision</title><p>The proportion of true positive predictions to the total number of positive predictions. High precision indicates that the model produces few false positive errors, which is important in minimizing irrelevant treatments.</p></sec><sec id="s2-10-4"><title>Recall (Sensitivity)</title><p>The ratio of correct positive predictions to the total actual positives. A higher recall means fewer cases of prediabetes were missed, making it necessary for early prediabetes diagnosis.</p></sec><sec id="s2-10-5"><title><italic>F</italic><sub>1</sub>-Score</title><p>The harmonic mean of precision and recall provides a balance between the two metrics. It is mainly valuable when false positives and false negatives have serious consequences.</p></sec><sec id="s2-10-6"><title>ROC-AUC Score</title><p>The ROC-AUC (receiver operating characteristic area under the curve) assesses the capability of the model to distinguish between both classes (normal and prediabetes). The ROC-AUC score provides an aggregate measure of performance throughout all classification thresholds, where a higher value refers to superior model performance.</p></sec><sec id="s2-10-7"><title>Cross-Validated ROC-AUC</title><p>In addition to evaluating ROC-AUC on the test set, cross-validated ROC-AUC provides a more reliable estimate of the model&#x2019;s ability to generalize. 
This metric was calculated using <italic>k</italic>-fold cross-validation, giving a better indication of how the model will perform on unseen data.</p><p>By using these evaluation metrics, the comparative performance of the ML models was assessed, with a particular focus on balancing accuracy, precision, recall, and <italic>F</italic><sub>1</sub>-score to ensure reliable predictions for prediabetes risk assessment.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>XGBoost, Random Forest, SVM, and KNN</title><p>This section provides a comparative evaluation of the ML models applied in this study&#x2014;XGBoost, random forest, SVM, and KNN&#x2014;along with the results of feature selection techniques, such as LASSO regression and PCA. The performance of each model is assessed using multiple evaluation metrics, including accuracy, precision, recall, <italic>F</italic><sub>1</sub>-score, and ROC-AUC scores, on both the test set and cross-validation. 
<xref ref-type="table" rid="table2">Table 2</xref> shows the performance metrics comparison of the ML models.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Performance metrics comparison of machine learning models.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Model</td><td align="left" valign="top">Accuracy (%)</td><td align="left" valign="top">Precision</td><td align="left" valign="top">Recall</td><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">ROC-AUC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> (test set)</td><td align="left" valign="top">Cross-validated ROC-AUC</td></tr></thead><tbody><tr><td align="left" valign="top">XGBoost<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">74.7</td><td align="left" valign="top">0.8128</td><td align="left" valign="top">0.7889</td><td align="left" valign="top">0.8007</td><td align="left" valign="top">0.7930</td><td align="left" valign="top">0.8600</td></tr><tr><td align="left" valign="top">Random forest</td><td align="left" valign="top">75.9</td><td align="left" valign="top">0.8391</td><td align="left" valign="top">0.7169</td><td align="left" valign="top">0.7732</td><td align="left" valign="top">0.8030</td><td align="left" valign="top">0.9117</td></tr><tr><td align="left" valign="top">SVM<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">73.9</td><td align="left" valign="top">0.6260</td><td align="left" valign="top">0.6686</td><td align="left" valign="top">0.6466</td><td align="left" valign="top">0.7791</td><td align="left" valign="top">0.8630</td></tr><tr><td align="left" valign="top">KNN<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">70.8</td><td align="left" valign="top">0.6901</td><td align="left" valign="top">0.6881</td><td 
align="left" valign="top">0.6890</td><td align="left" valign="top">0.7845</td><td align="left" valign="top">0.8397</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>ROC-AUC: receiver operating characteristic area under the curve.</p></fn><fn id="table2fn2"><p><sup>b</sup>XGBoost: extreme gradient boosting.</p></fn><fn id="table2fn3"><p><sup>c</sup>SVM: support vector machine.</p></fn><fn id="table2fn4"><p><sup>d</sup>KNN: <italic>k</italic>-nearest neighbor.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Model Performance Comparison</title><sec id="s3-2-1"><title>Overview</title><p>The following subsections present the comparative results of XGBoost, random forest, SVM, and KNN models, each fine-tuned using hyperparameter optimization and evaluated using key performance metrics.</p></sec><sec id="s3-2-2"><title>XGBoost</title><p>Based on 5-fold cross-validation, the XGBoost model showed a cross-validated ROC-AUC score of 0.86, indicating powerful discrimination between normal and prediabetic cases. In addition, the model achieved a precision of 0.8128, a recall of 0.7889, and an <italic>F</italic><sub>1</sub>-score of 0.8007 for the prediabetes class. This balanced performance emphasizes the model&#x2019;s strength to effectively minimize both false positives and false negatives, making it an effective method of prediabetes detection.</p></sec><sec id="s3-2-3"><title>Random Forest</title><p>The random forest model achieved an excellent performance with a cross-validated ROC-AUC score of 0.9117, demonstrating its capability to generalize well across various subsets of the data. The model demonstrated a precision of 0.8391, a recall of 0.7169, and an <italic>F</italic><sub>1</sub>-score of 0.7732 for the prediabetes class. 
This indicates that the random forest model not only lowers the likelihood of false positives but also keeps a powerful recall rate, guaranteeing that fewer cases of prediabetes are missed.</p></sec><sec id="s3-2-4"><title>About SVM</title><p>An SVM model, evaluated through 5-fold cross-validation, achieved a cross-validated ROC-AUC score of 0.8630, indicating its ability to distinguish between normal and prediabetic cases with high accuracy. For the prediabetes class, the model achieved a precision of 0.6260, a recall of 0.6686, and an <italic>F</italic><sub>1</sub>-score of 0.6466. Despite the SVM model providing a moderate balance between precision and recall, its recall score indicates potential for missing fewer prediabetic cases, making it a feasible choice for early-stage diagnosis.</p></sec><sec id="s3-2-5"><title>About KNNs</title><p>The KNN model, evaluated using 5-fold cross-validation, demonstrated a cross-validated ROC-AUC score of 0.8397, reflecting its ability to differentiate between normal and prediabetic cases with moderate effectiveness. The model recorded a precision of 0.6901, a recall of 0.6881, and an <italic>F</italic><sub>1</sub>-score of 0.6890 for the prediabetes class. Although KNN performed slightly lower in terms of accuracy and precision compared to other models, it still provides an interpretable solution for prediabetes.</p></sec></sec><sec id="s3-3"><title>Performance Enhancement Through Hyperparameter Tuning</title><p>To optimize the performance of SVM and KNN, we used GridSearchCV for hyperparameter tuning. For more complex models such as XGBoost and random forest, RandomizedSearchCV was used to efficiently explore broader hyperparameter spaces.</p><p><xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref> highlight the improvement in model performance after hyperparameter optimization. 
All 4 models&#x2014;XGBoost, random forest, SVM, and KNN&#x2014;showed notable gains in both ROC-AUC and <italic>F</italic><sub>1</sub>-score metrics. For instance, XGBoost&#x2019;s ROC-AUC improved from 0.782 to 0.860, and random forest&#x2019;s from 0.807 to 0.9117. These results confirm the effectiveness of using GridSearchCV and RandomizedSearchCV in tailoring model parameters to the dataset, ultimately boosting classification accuracy and robustness. This step is particularly critical for clinical applications, where small improvements in sensitivity or specificity can have substantial impacts on patient outcomes.</p><p>The option n_jobs = -1 was used to enable parallel processing. Each model required 3-8 minutes to be tuned on a standard multicore computer.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Hyperparameter tuning summary for all models.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" colspan="2">Model and hyperparameter</td><td align="left" valign="top">Range or values tested</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">SVM<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">C</td><td align="left" valign="top">[0.1, 1, 10]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Kernel</td><td align="left" valign="top">['linear', 'rbf']</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Gamma (rbf)</td><td align="left" valign="top">['scale', 'auto']</td></tr><tr><td align="left" valign="top" colspan="2">KNN<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">n_neighbors</td><td align="left" valign="top">[3, 5, 7, 9, 
11]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Metric</td><td align="left" valign="top">['euclidean', 'manhattan']</td></tr><tr><td align="left" valign="top" colspan="2">XGBoost<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">n_estimators</td><td align="left" valign="top">[50, 100, 200, 300]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">learning_rate</td><td align="left" valign="top">[0.01, 0.05, 0.1, 0.2]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">max_depth</td><td align="left" valign="top">[3, 5, 7, 9]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Gamma</td><td align="left" valign="top">[0, 0.1, 0.3, 0.5]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Subsample</td><td align="left" valign="top">[0.6, 0.8, 1.0]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">colsample_bytree</td><td align="left" valign="top">[0.6, 0.8, 1.0]</td></tr><tr><td align="left" valign="top" colspan="2">Random forest</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">n_estimators</td><td align="left" valign="top">[50, 100, 200]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">max_depth</td><td align="left" valign="top">[None, 3, 5]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">min_samples_split</td><td align="left" valign="top">[2, 5]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">min_samples_leaf</td><td align="left" valign="top">[1, 2]</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">max_features</td><td align="left" valign="top">['sqrt', 'log2']</td></tr><tr><td align="left" valign="top"/><td align="left" 
valign="top">Bootstrap</td><td align="left" valign="top">[True]</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>SVM: support vector machine.</p></fn><fn id="table3fn2"><p><sup>b</sup>KNN: <italic>k</italic>-nearest neighbor.</p></fn><fn id="table3fn3"><p><sup>c</sup>XGBoost: extreme gradient boosting.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Effect of hyperparameter tuning on model performance.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" colspan="2">Model and metric</td><td align="left" valign="top">Default</td><td align="left" valign="top">Tuned (GridSearchCV/RandomizedSearchCV)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">XGBoost<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">ROC-AUC<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.782</td><td align="left" valign="top">0.860</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.731</td><td align="left" valign="top">0.801</td></tr><tr><td align="left" valign="top" colspan="2">Random forest</td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">ROC-AUC</td><td align="left" valign="top">0.807</td><td align="left" valign="top">0.9117</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.742</td><td align="left" valign="top">0.773</td></tr><tr><td align="left" valign="top" colspan="2">SVM<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" 
valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">ROC-AUC</td><td align="left" valign="top">0.813</td><td align="left" valign="top">0.863</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.591</td><td align="left" valign="top">0.646</td></tr><tr><td align="left" valign="top" colspan="2">KNN<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">ROC-AUC</td><td align="left" valign="top">0.805</td><td align="left" valign="top">0.839</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.652</td><td align="left" valign="top">0.689</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>XGBoost: extreme gradient boosting.</p></fn><fn id="table4fn2"><p><sup>b</sup>ROC-AUC: receiver operating characteristic area under the curve.</p></fn><fn id="table4fn3"><p><sup>c</sup>SVM: support vector machine.</p></fn><fn id="table4fn4"><p><sup>d</sup>KNN: <italic>k</italic>-nearest neighbor.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Descriptive Patterns From Exploratory Data Analysis Findings</title><sec id="s3-4-1"><title>Overview</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> shows several important patterns that emerged. The following features are highly correlated.</p></sec><sec id="s3-4-2"><title>Strong Positive Correlation</title><p>Total cholesterol and LDL-C exhibited a strong positive correlation. As a result, the model may be redundant due to those variables sharing similar information. One of these features could potentially be excluded in the feature selection phase if it has a high correlation. 
It was found that total protein and albumin exhibit a high correlation, suggesting that combining them may not provide more insight than using either separately.</p></sec><sec id="s3-4-3"><title>Weak or No Correlations</title><p>Correlations between variables such as age, BMI, and uric acid were weak or negligible. This is a significant finding because these variables may provide unique independent information that makes model-building more effective.</p></sec><sec id="s3-4-4"><title>Negative Correlation</title><p>A mild negative correlation was found between LDL-C and HDL-C, which is consistent with their known inverse roles in cardiovascular health. Age and HDL-C also exhibited a slight negative correlation, suggesting that lipid profiles might change with aging. These exploratory insights show that multicollinearity issues happen when highly correlated variables distort the model&#x2019;s ability to differentiate between them. It is crucial to recognize such relationships early in the process so that multicollinearity can be handled, and redundant features can be dropped in the next step, feature selection.</p><p>A summary plot of SHAP data derived from the XGBoost model is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. The most significant predictors are age, BMI, HDL-C, and LDL-C. As these variables are well-established risk factors for prediabetes, these findings support clinical intuition. Additionally, SHAP provided valuable visual confirmation that agreed with both the correlation analysis and the LASSO feature selection. Using these exploratory data analysis findings, LASSO regression and PCA were applied for feature selection, ensuring that informative predictors were retained while reducing redundancy and improving interpretability.<inline-graphic xlink:href="bioinform_v6i1e70621_fig02.png"/></p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>SHAP summary plot of XGBoost model. 
ALB: albumin; ALT: alanine aminotransferase; DB: direct bilirubin; HDL-C: high-density lipoprotein cholesterol; LDL-C: low-density lipoprotein cholesterol; PRO: urine protein; SBP: systolic blood pressure; SCr: serum creatinine; SHAP: Shapley Additive Explanations; TG: triglyceride; U-GLU: urine glucose; UA: uric acid; XGBoost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e70621_fig03.png"/></fig></sec></sec><sec id="s3-5"><title>Feature Importance and Selection</title><p>Feature selection over LASSO regression guaranteed that every model was trained on the most relevant predictors. During LASSO, features like BMI, age, and HDL-C were consistently identified as significant predictors of prediabetes as shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>. These features were retrained in the final model because of their significant predictive power across different iterations. The models differed in which features they emphasized:</p><list list-type="bullet"><list-item><p>XGBoost identified BMI as the most significant predictor, aligning with established research that links higher BMI with increased prediabetes risk.</p></list-item><list-item><p>SVM prioritized age as the first predictor, indicating that age may play an additional critical role when nonlinear relationships between variables are considered.</p></list-item></list><list list-type="bullet"><list-item><p>Random forest and KNN provide insights into other key features such as LDL-C and HDL-C, demonstrating the various aspects of the data that every algorithm emphasizes.</p></list-item></list><p>This variance in feature significance underscores the utility of designing diverse models and selection techniques to better understand the predictors of prediabetes risk.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Features importance plots for XGBoost and SVM. 
ALB: albumin; ALT: alanine aminotransferase; DB: direct bilirubin; HDL-C: high-density lipoprotein cholesterol; LDL-C: low-density lipoprotein cholesterol; PRO: urine protein; SBP: systolic blood pressure; SCr: serum creatinine; SR: ; SVM: support vector machine; TG: triglyceride; UA: uric acid; XGBoost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e70621_fig04.png"/></fig></sec><sec id="s3-6"><title>PCA Component Retention</title><p>PCA retained 12 principal components, accounting for 95% of the variance in the dataset.</p></sec><sec id="s3-7"><title>Confusion Matrices</title><sec id="s3-7-1"><title>Overview</title><p>As shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>, the confusion matrix demonstrates that every model&#x2019;s classification performance is detailed in terms of distinguishing normal cases from prediabetic cases. These results reflect the trade-offs each model faces in terms of true positives, false positives, true negatives, and false negatives.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Confusion matrix for XGBoost, SVM, random forest, and KNN models. KNN: <italic>k</italic>-nearest neighbor; SVM: support vector machine; XGBoost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e70621_fig05.png"/></fig></sec><sec id="s3-7-2"><title>XGBoost</title><p>A comparatively balanced classification was accomplished with the XGBoost model, with 482 true positives and 227 true negatives, referring to good sensitivity. 
However, it recorded 129 false negatives and 111 false positives, proposing some limitations in minimizing misclassification errors, especially false negatives, which are pivotal in clinical settings.</p></sec><sec id="s3-7-3"><title>Random Forest</title><p>The random forest model (default threshold of 0.5) correctly identified 513 true positives and 208 true negatives, which are better results compared to XGBoost. The model demonstrated a higher sensitivity than other models, as it reduced the number of false negatives to 98. Despite this, 130 false positives were observed, which indicates a slightly higher trade-off in specificity.</p><p>A threshold adjustment of 0.2627 substantially improved the random forest&#x2019;s ability to detect prediabetic cases, resulting in 589 true positives and 22 false negatives. A notable rise in false positives (230) and a reduction in true negatives (108) resulted from this adjustment, indicating a move toward maximizing sensitivity over specificity. There may be some advantages to this configuration in scenarios where minimizing missed prediabetic cases is prioritized over averting false positives.</p></sec><sec id="s3-7-4"><title>About SVM</title><p>For the overall distribution of true positives and true negatives, the SVM model obtained 476 true positives and 226 true negatives, which is like XGBoost&#x2019;s. A total of 135 false negatives and 112 false positives have been recorded, indicating that while SVM has a strong classification capability, it is more susceptible to false negatives, which limits its effectiveness for early detection cases.</p></sec><sec id="s3-7-5"><title>About KNNs</title><p>This model performed moderately, generating 421 true positives and 251 true negatives. Even though KNN can effectively detect normal cases, it is less reliable when it comes to identifying prediabetic cases. 
It showed 190 false negatives and 87 false positives, indicating a higher rate of misclassification.</p><p>To summarize, the confusion matrices demonstrate that the random forest model minimizes false negatives better than other models, especially when thresholds are adjusted. Random forest has a significant advantage over XGBoost and SVM when it comes to sensitivity, which makes it particularly suitable for prediabetes detection, where minimizing missed cases is crucial. While KNN is the most effective at identifying normal cases, it lacks the discriminative power necessary to accurately classify prediabetes, illustrating that it may be more fit as a baseline or for smaller datasets.</p></sec></sec><sec id="s3-8"><title>ROC Curves</title><sec id="s3-8-1"><title>Overview</title><p><xref ref-type="fig" rid="figure5">Figure 5</xref> shows the ROC (receiver operating characteristic) curves for every model, further clarifies the trade-offs between sensitivity and specificity, and shows the performance of each model in terms of how well it separates between normal and prediabetic cases. The random forest model showed the most convenient ROC curve, while XGBoost and SVM also displayed powerful curves, suggesting effective categorization performance.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>ROC curve comparison across models. KNN: <italic>k</italic>-nearest neighbor; SVM: support vector machine; XGBoost: extreme gradient boosting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e70621_fig06.png"/></fig></sec><sec id="s3-8-2"><title>XGBoost</title><p>This classifier showed an AUC (area under the curve) of 0.79. 
The XGBoost ROC curve reflects a relatively good trade-off between the true positive rate (sensitivity) and false positive rate (1 &#x2013; specificity), indicating that it is an effective classification model, but has some room for improvement in distinguishing classes.</p></sec><sec id="s3-8-3"><title>About SVM</title><p>The SVM classifier produced a slightly lower AUC of 0.78. However, the SVM struggles slightly more with false positives, as indicated by its ROC curve, which does not consistently approach the top-left corner. Despite this, it performs reasonably well when it comes to classification.</p></sec><sec id="s3-8-4"><title>Random Forest</title><p>Across the 4 models tested, the random forest model achieved the highest AUC, at 0.80. With a more pronounced upward curve, its ROC curve reflects better differentiation between positive and negative classes, showcasing outstanding classification abilities.</p></sec><sec id="s3-8-5"><title>About KNNs</title><p>The KNN classifier achieved a score of 0.78, suggesting a fair rank of accuracy in the diagnosis of positive and negative cases. According to the ROC curve for the KNN model, there is a moderate trade-off between the true positive rate (sensitivity) and the false positive rate (1 &#x2013; specificity). As well, there is some evidence to suggest that the KNN model has some ability to separate the 2 classes, but its shape suggests that it has room for improvement, as it does not consistently approach the top-left corner, which would indicate an ideal performance.</p><p>In a nutshell, all 4 models exhibit durable performance, with AUC values ranging from 0.78 to 0.80. 
The random forest model manifests as the best-performing classifier, followed closely by XGBoost, SVM, and KNN.</p></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Through systematically integrating model comparison, advanced hyperparameter tuning, and interpretable feature selection techniques, we present a robust, interpretable framework for early prediabetes prediction. By combining SHAP analysis and LASSO regression, this research provides both high performance and transparency, compared to previous studies that focused solely on accuracy.</p></sec><sec id="s4-2"><title>Comparative Strengths and Limitations of Each Model</title><sec id="s4-2-1"><title>Overview</title><p>For prediabetes prediction, XGBoost, random forest, SVM, and KNN each show distinct strengths and weaknesses.</p></sec><sec id="s4-2-2"><title>Random Forest</title><p>In terms of overall discriminative ability, the random forest model accomplished a superior cross-validated ROC-AUC score (0.9117). According to this result, random forest is a robust choice for early detection scenarios as it can generalize to different datasets well. Due to its ability to prioritize recall through threshold adjustments, 22 false negatives were reduced, but false positives increased (230). In view of this trade-off, random forest may be highly powerful when the cost of missing a prediabetic case outweighs the risk of overdiagnosis.</p></sec><sec id="s4-2-3"><title>XGBoost</title><p>In evaluation, the XGBoost classifier showcased robust performance, as it attained a high precision score of 0.8128 and a balanced recall score. According to these metrics, it seems that XGBoost is particularly adept at minimizing false positives and false negatives, which is highly critical in clinical settings where diagnostic accuracy directly influences patient outcomes. 
The ROC-AUC score of XGBoost did not surpass that of random forest, despite its ability to balance sensitivity and specificity, making it a viable choice for routine clinical applications.</p></sec><sec id="s4-2-4"><title>About SVM</title><p>With an AUC of 0.78, the SVM model ranked behind both XGBoost and random forest. Despite their superior performance in high-dimensional spaces and in datasets with clear class separation, SVM models have limited linear separability in the prediabetes dataset, impacting their discriminative power. The model has a good ROC-AUC and <italic>F</italic><sub>1</sub>-score, with reasonable precision and recall, but when it comes to complex relationships, it lags behind the others. Optimizing feature engineering may upgrade its performance by searching alternative SVM kernels, combining nonlinear interactions, or incorporating alternative kernels.</p></sec><sec id="s4-2-5"><title>About KNNs</title><p>It performed rationally well in terms of classification performance but ranked lowest in terms of accuracy among the evaluated models, with an accuracy of 70.8% and ROC-AUC of 0.78. Because of its simplicity and reliance on distance metrics, KNN is expected to have lower discriminative power than more complex models such as random forest and XGBoost. This model may be valuable as a baseline model or may be convenient for small datasets with a focus on computational efficiency. The reasonable performance of KNN is a result of its sensitivity to distance metrics and the number of neighbors (<italic>k</italic>), which may prevent it from catching subtle differences in detecting normal and prediabetic cases. 
Thus, while KNN may be beneficial in straightforward scenarios, it does not have the same level of precision and recall as more sophisticated models.</p></sec></sec><sec id="s4-3"><title>Impact of Feature Selection</title><p>Feature selection played a crucial role in optimizing the models&#x2019; performance by focusing on the main relevant predictors. LASSO regression was used to characterize the prime features across models, with BMI, age, LDL-C, and HDL-C consistently emerging as important risk factors for prediabetes. In addition to improving the interpretability of the models, this approach also improved the predictive accuracy by reducing overfitting. The strict feature selection process warranted that the models stayed efficient while maintaining high classification power.</p></sec><sec id="s4-4"><title>Confusion Matrix and Threshold Analysis</title><p>The performance metrics were significantly influenced by adjusting decision thresholds, especially for random forest and XGBoost. A threshold adjustment in random forest minimized the risk of missed diagnoses by reducing false negatives (22 cases). Even so, this came at the expense of a boosted number of false positives (230 cases), suggesting a trade-off between recall and precision. XGBoost, while less sensitive to threshold changes, maintained a balanced approach, limiting both false positives and false negatives effectively. As a result of these outcomes, threshold tuning plays an important role in optimizing model performance for specific clinical applications, such as prioritizing recall in high-risk populations to avoid disease progression.</p></sec><sec id="s4-5"><title>Clinical Implications</title><p>The results suggest that XGBoost and random forest are the most promising models for enhancing prediabetes diagnosis, given their ability to generalize across different datasets and include reliable classification performance. 
The higher ROC-AUC score achieved by random forest (91.17%) reflects its potential for widespread use in clinical settings, especially where minimizing the risk of missed cases is crucial. The powerful performance of XGBoost across diverse metrics also highlights its practicality for routine screening, where both false positives and false negatives need to be minimized. By adjusting model thresholds, clinicians can customize diagnostic strategies to meet individual patient needs, such as increasing sensitivity for at-risk patients. Even though SVMs and KNNs do not outperform the best models, they still provide useful insights, especially when data dimensionality or simplicity are important factors.</p></sec><sec id="s4-6"><title>Conclusions</title><p>ML models, specifically random forest and XGBoost, have been found to be best suited to prediabetes risk assessment, demonstrating strong discriminative power and high ROC-AUC scores. Combined with feature selection techniques such as LASSO regression, these models offer worthy insights into essential prediabetes predictors, such as BMI, age, and HDL-C. Based on the ROC and AUC analyses, all models&#x2014;XGBoost, SVM, random forest, and KNN&#x2014;are viable options for predicting prediabetes. Random forests are robust classifiers because of their ensemble nature, which reduces overfitting and enhances generalizability. SVM and XGBoost also produce competitive results, suggesting their classification abilities can be improved with further parameter tuning. 
With systematic exploratory data analysis and feature selection, these models can become reliable tools for the early detection of prediabetes and offer clear pathways for further optimization.</p><p>To confirm the generalizability of these models, future research should include validating them in diverse populations, adding biomarkers and genetics to improve prediction accuracy, and integrating these models into clinical decision support systems to assess risk in real time. These models contribute to more accurate and timely diagnosis of prediabetes, promoting early intervention and ultimately improving health outcomes.</p></sec></sec></body><back><ack><p>The authors would like to thank the BIOCORE Research Group, the Center for Advanced Computing Technology (C-ACT), Fakulti Teknologi Maklumat dan Komunikasi (FTMK), and the Centre for Research and Innovation Management (CRIM), Universiti Teknikal Malaysia Melaka (UTeM), for providing the facilities and support for this research. All authors declared that they had insufficient funding to support open access publication of this manuscript, including from affiliated organizations or institutions, funding agencies, or other organizations. 
JMIR Publications provided article processing fee (APF) support for the publication of this article.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb2">HDL-C</term><def><p>high-density lipoprotein cholesterol</p></def></def-item><def-item><term id="abb3">KNN</term><def><p><italic>k</italic>-nearest neighbor</p></def></def-item><def-item><term id="abb4">LASSO</term><def><p>Least Absolute Shrinkage and Selection Operator</p></def></def-item><def-item><term id="abb5">LDL-C</term><def><p>low-density lipoprotein cholesterol</p></def></def-item><def-item><term id="abb6">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb7">PCA</term><def><p>principal component analysis</p></def></def-item><def-item><term id="abb8">ROC</term><def><p>receiver operating characteristic</p></def></def-item><def-item><term id="abb9">ROC-AUC</term><def><p>receiver operating characteristic area under the curve</p></def></def-item><def-item><term id="abb10">SHAP</term><def><p>Shapley Additive Explanations</p></def></def-item><def-item><term id="abb11">SMOTE</term><def><p>Synthetic Minority Oversampling Technique</p></def></def-item><def-item><term id="abb12">SVM</term><def><p>support vector machine</p></def></def-item><def-item><term id="abb13">XGBoost</term><def><p>extreme gradient boosting</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lou</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Performance of a 
prediabetes risk prediction model: a systematic review</article-title><source>Heliyon</source><year>2023</year><month>05</month><volume>9</volume><issue>5</issue><fpage>e15529</fpage><pub-id pub-id-type="doi">10.1016/j.heliyon.2023.e15529</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schwartz</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Tseng</surname><given-names>E</given-names> </name><name name-style="western"><surname>Maruthur</surname><given-names>NM</given-names> </name><name name-style="western"><surname>Rouhizadeh</surname><given-names>M</given-names> </name></person-group><article-title>Identification of prediabetes discussions in unstructured clinical documentation: validation of a natural language processing algorithm</article-title><source>JMIR Med Inform</source><year>2022</year><month>02</month><day>24</day><volume>10</volume><issue>2</issue><fpage>e29803</fpage><pub-id pub-id-type="doi">10.2196/29803</pub-id><pub-id pub-id-type="medline">35200154</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hathaway</surname><given-names>QA</given-names> </name><name name-style="western"><surname>Roth</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Pinti</surname><given-names>MV</given-names> </name><etal/></person-group><article-title>Machine-learning to stratify diabetic patients using novel cardiac biomarkers and integrative genomics</article-title><source>Cardiovasc Diabetol</source><year>2019</year><month>06</month><day>11</day><volume>18</volume><issue>1</issue><fpage>78</fpage><pub-id pub-id-type="doi">10.1186/s12933-019-0879-0</pub-id><pub-id pub-id-type="medline">31185988</pub-id></nlm-citation></ref><ref 
id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>De Silva</surname><given-names>K</given-names> </name><name name-style="western"><surname>J&#x00F6;nsson</surname><given-names>D</given-names> </name><name name-style="western"><surname>Demmer</surname><given-names>RT</given-names> </name></person-group><article-title>A combined strategy of feature selection and machine learning to identify predictors of prediabetes</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>03</month><day>1</day><volume>27</volume><issue>3</issue><fpage>396</fpage><lpage>406</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocz204</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dwyer</surname><given-names>DB</given-names> </name><name name-style="western"><surname>Falkai</surname><given-names>P</given-names> </name><name name-style="western"><surname>Koutsouleris</surname><given-names>N</given-names> </name></person-group><article-title>Machine learning approaches for clinical psychology and psychiatry</article-title><source>Annu Rev Clin Psychol</source><year>2018</year><month>05</month><day>7</day><volume>14</volume><fpage>91</fpage><lpage>118</lpage><pub-id pub-id-type="doi">10.1146/annurev-clinpsy-032816-045037</pub-id><pub-id pub-id-type="medline">29401044</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Talari</surname><given-names>P</given-names> </name><name name-style="western"><surname>N</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kaur</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Hybrid feature selection and classification technique for early 
prediction and severity of diabetes type 2</article-title><source>PLoS ONE</source><year>2024</year><volume>19</volume><issue>1</issue><fpage>e0292100</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0292100</pub-id><pub-id pub-id-type="medline">38236900</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Q</given-names> </name><name name-style="western"><surname>He</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>Y</given-names> </name></person-group><article-title>Predicting the 2-year risk of progression from prediabetes to diabetes using machine learning among Chinese elderly adults</article-title><source>J Pers Med</source><year>2022</year><month>06</month><day>27</day><volume>12</volume><issue>7</issue><fpage>7</fpage><pub-id pub-id-type="doi">10.3390/jpm12071055</pub-id><pub-id pub-id-type="medline">35887552</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abbas</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mall</surname><given-names>R</given-names> </name><name name-style="western"><surname>Errafii</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Simple risk score to screen for prediabetes: a cross-sectional study from the Qatar Biobank cohort</article-title><source>J Diabetes Investig</source><year>2021</year><month>06</month><volume>12</volume><issue>6</issue><fpage>988</fpage><lpage>997</lpage><pub-id 
pub-id-type="doi">10.1111/jdi.13445</pub-id><pub-id pub-id-type="medline">33075216</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Han</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A nomogram model for predicting 5-year risk of prediabetes in Chinese adults</article-title><source>Sci Rep</source><year>2023</year><volume>13</volume><issue>1</issue><fpage>1</fpage><lpage>16</lpage><pub-id pub-id-type="doi">10.1038/s41598-023-50122-3</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>LP</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>F</given-names> </name><name name-style="western"><surname>Li</surname><given-names>YZ</given-names> </name><etal/></person-group><article-title>Development and validation of a risk assessment model for prediabetes in China national diabetes survey</article-title><source>World J Clin Cases</source><year>2022</year><month>11</month><day>16</day><volume>10</volume><issue>32</issue><fpage>11789</fpage><lpage>11803</lpage><pub-id pub-id-type="doi">10.12998/wjcc.v10.i32.11789</pub-id><pub-id pub-id-type="medline">36405266</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dong</surname><given-names>W</given-names> </name><name name-style="western"><surname>Tse</surname><given-names>TYE</given-names> </name><name name-style="western"><surname>Mak</surname><given-names>LI</given-names> 
</name><etal/></person-group><article-title>Non-laboratory-based risk assessment model for case detection of diabetes mellitus and pre-diabetes in primary care</article-title><source>J Diabetes Investig</source><year>2022</year><month>08</month><volume>13</volume><issue>8</issue><fpage>1374</fpage><lpage>1386</lpage><pub-id pub-id-type="doi">10.1111/jdi.13790</pub-id><pub-id pub-id-type="medline">35293149</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liaw</surname><given-names>LCM</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Goh</surname><given-names>PY</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>CP</given-names> </name></person-group><article-title>A histogram SMOTE-based sampling algorithm with incremental learning for imbalanced data classification</article-title><source>Inf Sci</source><year>2025</year><month>01</month><volume>686</volume><fpage>121193</fpage><pub-id pub-id-type="doi">10.1016/j.ins.2024.121193</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Raju</surname><given-names>VNG</given-names> </name><name name-style="western"><surname>Lakshmi</surname><given-names>KP</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>VM</given-names> </name><name name-style="western"><surname>Kalidindi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Padma</surname><given-names>V</given-names> </name></person-group><article-title>Study the influence of normalization/transformation process on the accuracy of supervised classification</article-title><conf-name>2020 Third International Conference on Smart Systems and 
Inventive Technology (ICSSIT)</conf-name><conf-date>Aug 20-22, 2020</conf-date><conf-loc>Tirunelveli, India</conf-loc><fpage>729</fpage><lpage>735</lpage><pub-id pub-id-type="doi">10.1109/ICSSIT48917.2020.9214160</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Da Poian</surname><given-names>V</given-names> </name><name name-style="western"><surname>Theiling</surname><given-names>B</given-names> </name><name name-style="western"><surname>Clough</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Exploratory data analysis (EDA) machine learning approaches for ocean world analog mass spectrometry</article-title><source>Front Astron Space Sci</source><year>2023</year><volume>10</volume><issue>May</issue><fpage>1</fpage><lpage>17</lpage><pub-id pub-id-type="doi">10.3389/fspas.2023.1134141</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Saxena</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sampada</surname><given-names>GC</given-names> </name></person-group><article-title>A novel approach for feature selection and classification of diabetes mellitus: machine learning methods</article-title><source>Comput Intell Neurosci</source><year>2022</year><volume>2022</volume><fpage>3820360</fpage><pub-id pub-id-type="doi">10.1155/2022/3820360</pub-id><pub-id pub-id-type="medline">35463255</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Noaro</surname><given-names>G</given-names> </name><name name-style="western"><surname>Cappon</surname><given-names>G</given-names> </name><name name-style="western"><surname>Vettoretti</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sparacino</surname><given-names>G</given-names> </name><name name-style="western"><surname>Favero</surname><given-names>SD</given-names> </name><name name-style="western"><surname>Facchinetti</surname><given-names>A</given-names> </name></person-group><article-title>Machine-learning based model to improve insulin bolus calculation in type 1 diabetes therapy</article-title><source>IEEE Trans Biomed Eng</source><year>2021</year><month>01</month><volume>68</volume><issue>1</issue><fpage>247</fpage><lpage>255</lpage><pub-id pub-id-type="doi">10.1109/TBME.2020.3004031</pub-id><pub-id pub-id-type="medline">32746033</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gollapalli</surname><given-names>M</given-names> </name><name name-style="western"><surname>Alansari</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alkhorasani</surname><given-names>H</given-names> </name><etal/></person-group><article-title>A novel stacking ensemble for detecting three types of diabetes mellitus using a Saudi Arabian dataset: pre-diabetes, T1DM, and T2DM</article-title><source>Comput Biol Med</source><year>2022</year><volume>147</volume><fpage>105757</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2022.105757</pub-id><pub-id pub-id-type="medline">35777087</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jia</surname><given-names>W</given-names> </name><name 
name-style="western"><surname>Sun</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lian</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>S</given-names> </name></person-group><article-title>Feature dimensionality reduction: a review</article-title><source>Complex Intell Syst</source><year>2022</year><month>06</month><volume>8</volume><issue>3</issue><fpage>2663</fpage><lpage>2693</lpage><pub-id pub-id-type="doi">10.1007/s40747-021-00637-x</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hasan</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Alam</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Das</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hossain</surname><given-names>E</given-names> </name><name name-style="western"><surname>Hasan</surname><given-names>M</given-names> </name></person-group><article-title>Diabetes prediction using ensembling of different machine learning classifiers</article-title><source>IEEE Access</source><year>2020</year><volume>8</volume><fpage>76516</fpage><lpage>76531</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2020.2989857</pub-id><pub-id pub-id-type="medline">34812373</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>CF</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>IC</given-names> </name><etal/></person-group><article-title>Machine learning prediction of prediabetes in a young male Chinese cohort with 5.8-year 
follow-up</article-title><source>Diagnostics (Basel)</source><year>2024</year><month>05</month><day>8</day><volume>14</volume><fpage>10</fpage><pub-id pub-id-type="doi">10.3390/diagnostics14100979</pub-id><pub-id pub-id-type="medline">38786280</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alzyoud</surname><given-names>M</given-names> </name><name name-style="western"><surname>Alazaidah</surname><given-names>R</given-names> </name><name name-style="western"><surname>Aljaidi</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Diagnosing diabetes mellitus using machine learning techniques</article-title><source>Int J Data Netw Sci</source><year>2024</year><volume>8</volume><fpage>179</fpage><lpage>188</lpage><pub-id pub-id-type="doi">10.5267/j.ijdns.2023.10.006</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Olisah</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>L</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>M</given-names> </name></person-group><article-title>Diabetes mellitus prediction and diagnosis from a data preprocessing and machine learning perspective</article-title><source>Comput Methods Programs Biomed</source><year>2022</year><month>06</month><volume>220</volume><fpage>106773</fpage><pub-id pub-id-type="doi">10.1016/j.cmpb.2022.106773</pub-id><pub-id pub-id-type="medline">35429810</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Diranisha</surname><given-names>V</given-names> </name><name 
name-style="western"><surname>Triayudi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Komalasari</surname><given-names>RT</given-names> </name></person-group><article-title>Implementation of k-nearest neighbour (KNN) algorithm and random forest algorithm in identifying diabetes</article-title><source>SAGA J Technol Inform Syst</source><year>2024</year><volume>2</volume><issue>2</issue><fpage>234</fpage><lpage>244</lpage><pub-id pub-id-type="doi">10.58905/saga.v2i2.253</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yennimar</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rasid</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kenedy</surname><given-names>S</given-names> </name></person-group><article-title>Implementation of support vector machine algorithm with hyper-tuning randomized search in stroke prediction</article-title><source>J Sist Inf Ilmu Komput Prima</source><year>2023</year><volume>6</volume><issue>2</issue><fpage>61</fpage><lpage>65</lpage><pub-id pub-id-type="doi">10.34012/jurnalsisteminformasidanilmukomputer.v6i2.3479</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yates</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Aandahl</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Richards</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Brook</surname><given-names>BW</given-names> </name></person-group><article-title>Cross validation for model selection: a review with examples from ecology</article-title><source>Ecol 
Monogr</source><year>2023</year><month>02</month><volume>93</volume><issue>1</issue><fpage>e1557</fpage><pub-id pub-id-type="doi">10.1002/ecm.1557</pub-id></nlm-citation></ref></ref-list></back></article>