<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JBB</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Bioinform Biotech</journal-id>
      <journal-title>JMIR Bioinformatics and Biotechnology</journal-title>
      <issn pub-type="epub">2563-3570</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v3i1e29404</article-id>
      <article-id pub-id-type="pmid"/>
      <article-id pub-id-type="doi">10.2196/29404</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Prediction of Antibody-Antigen Binding via Machine Learning: Development of Data Sets and Evaluation of Methods</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Qiu</surname>
            <given-names>Zheng</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Xiao</surname>
            <given-names>Yiling</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ackerman</surname>
            <given-names>Margaret E</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sundaramoorthi</surname>
            <given-names>Hemalatha</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Ye</surname>
            <given-names>Chao</given-names>
          </name>
          <degrees>MIS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5740-8531</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Hu</surname>
            <given-names>Wenxing</given-names>
          </name>
          <degrees>MIT</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3041-3573</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Gaeta</surname>
            <given-names>Bruno</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>School of Computer Science and Engineering</institution>
            <institution>The University of New South Wales</institution>
            <addr-line>Computer Science Building (K17)</addr-line>
            <addr-line>Engineering Rd, UNSW</addr-line>
            <addr-line>Sydney, 2052</addr-line>
            <country>Australia</country>
            <phone>61 293857213</phone>
            <email>bgaeta@unsw.edu.au</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4723-4982</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Computer Science and Engineering</institution>
        <institution>The University of New South Wales</institution>
        <addr-line>Sydney</addr-line>
        <country>Australia</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Computer Science</institution>
        <institution>School of Information Science and Technology</institution>
        <institution>Tokyo Institute of Technology</institution>
        <addr-line>Tokyo</addr-line>
        <country>Japan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Bruno Gaeta <email>bgaeta@unsw.edu.au</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <season>Jan-Dec</season>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>28</day>
        <month>10</month>
        <year>2022</year>
      </pub-date>
      <volume>3</volume>
      <issue>1</issue>
      <elocation-id>e29404</elocation-id>
      <history>
        <date date-type="received">
          <day>6</day>
          <month>4</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>5</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>23</day>
          <month>9</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>18</day>
          <month>10</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Chao Ye, Wenxing Hu, Bruno Gaeta. Originally published in JMIR Bioinformatics and Biotechnology (https://bioinform.jmir.org), 28.10.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Bioinformatics and Biotechnology, is properly cited. The complete bibliographic information, a link to the original publication on https://bioinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://bioinform.jmir.org/2022/1/e29404" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The mammalian immune system is able to generate antibodies against a huge variety of antigens, including bacteria, viruses, and toxins. The ultradeep DNA sequencing of rearranged immunoglobulin genes has considerable potential in furthering our understanding of the immune response, but it is limited by the lack of a high-throughput, sequence-based method for predicting the antigen(s) that a given immunoglobulin recognizes.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>As a step toward the prediction of antibody-antigen binding from sequence data alone, we aimed to compare a range of machine learning approaches that were applied to a collated data set of antibody-antigen pairs in order to predict antibody-antigen binding from sequence data.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Data for training and testing were extracted from the Protein Data Bank and the Coronavirus Antibody Database, and additional antibody-antigen pair data were generated by using a molecular docking protocol. Several machine learning methods, including the weighted nearest neighbor method, the nearest neighbor method with the BLOSUM62 matrix, and the random forest method, were applied to the problem.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The final data set contained 1157 antibodies and 57 antigens that were combined in 5041 antibody-antigen pairs. The best performance for the prediction of interactions was obtained by using the nearest neighbor method with the BLOSUM62 matrix, which resulted in around 82% accuracy on the full data set. These results provide a useful frame of reference, as well as protocols and considerations, for machine learning and data set creation in the prediction of antibody-antigen binding.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Several machine learning approaches were compared to predict antibody-antigen interaction from protein sequences. Both the data set (in CSV format) and the machine learning program (coded in Python) are freely available for download on GitHub.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>DNA sequencing</kwd>
        <kwd>DNA</kwd>
        <kwd>DNA sequence</kwd>
        <kwd>sequence data</kwd>
        <kwd>molecular biology</kwd>
        <kwd>genomic</kwd>
        <kwd>random forest</kwd>
        <kwd>nearest neighbor</kwd>
        <kwd>immunoglobulin</kwd>
        <kwd>genetics</kwd>
        <kwd>antibody-antigen binding</kwd>
        <kwd>antigen</kwd>
        <kwd>antibody</kwd>
        <kwd>structural biology</kwd>
        <kwd>machine learning</kwd>
        <kwd>protein modeling</kwd>
        <kwd>protein</kwd>
        <kwd>proteomic</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>DNA sequencing technologies are providing new insights into the immune response by allowing for the large-scale sequencing of rearranged immunoglobulin genes that are present in an individual [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. However, the applications of this approach are limited by the lack of methods for determining the antigen(s) to which a specific immunoglobulin (ie, one encoded by a given sequence) binds. Individual immunoglobulins can be tested experimentally at significant cost; however, the large-scale characterization of binding properties based on sequence data is currently impossible.</p>
      <p>Antigen binding is mediated by the complementarity-determining regions (CDRs) of an antibody, which are shared between heavy and light immunoglobulin chains. Computational methods for predicting antibody-antigen interactions that leverage structure prediction and docking have been proposed [<xref ref-type="bibr" rid="ref3">3</xref>]. However, the use of these methods requires knowledge of the 3D structures of antibodies and antigens. The direct prediction of antibody-antigen interactions from protein sequences remains an open problem.</p>
      <p>Machine learning–based tools, such as mCSM-AB [<xref ref-type="bibr" rid="ref4">4</xref>] and ADAPT (Assisted Design of Antibody and Protein Therapeutics) [<xref ref-type="bibr" rid="ref5">5</xref>], have had some success in predicting antibody interactions in other contexts. mCSM-AB is a web server for predicting changes in antibody-antigen affinity upon mutation, using graph-based signatures. ADAPT is an affinity maturation platform that interleaves predictions and testing, and it has been previously validated on monoclonal antibodies.</p>
      <p>A more general method for predicting whether an antibody will bind to a protein antigen based on the antibody and antigen sequences remains elusive, in part due to the lack of comprehensive training data for the development of machine learning models. This study is intended as a first step toward this goal and aims to assemble a training data set from a range of sources and evaluate the feasibility of applying machine learning algorithms to identify the binding of antibody-antigen pairs in this data set.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Set</title>
        <p>Due to the scarcity of suitable antibody-antigen pairs, computational docking was used to generate some of the data in the training and testing data set. The ClusPro (Boston University) [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>] and Rosetta (RosettaCommons) [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref12">12</xref>] web servers were used to create a data set of paired antibody-antigen complexes for machine learning. Both ClusPro and Rosetta were used for protein-protein molecular docking. Rosetta uses the SnugDock (RosettaCommons) algorithm [<xref ref-type="bibr" rid="ref10">10</xref>]. The Swiss-PdbViewer (Swiss Institute of Bioinformatics) [<xref ref-type="bibr" rid="ref13">13</xref>] was used to examine the resulting protein complex structures.</p>
        <p>A total of 50 antibody-antigen complexes were selected randomly from the Protein Data Bank (PDB) [<xref ref-type="bibr" rid="ref14">14</xref>]. The antibody-antigen complexes were separated by using a Perl script to produce PDB-formatted files as well as sequences for antibodies and antigens. CDRs were located by using the Rosetta antibody modeling web server. Antigens were docked with a range of antibodies by using ClusPro (used only to determine orientation), followed by Rosetta’s antibody docking program, SnugDock. In order to keep computation times manageable, not all antibodies were docked. Instead, 10 to 14 antibodies were randomly selected to be docked with each antigen in order to find the best orientation. The resulting complexes were submitted to the Rosetta SnugDock web server in order to calculate the best interface score. This produced structures for between 10 and 14 complexes per antigen, which, when added together with the original antibody-antigen complex, totaled 11 to 15 complexes per antigen. Altogether, 50 antigens were docked with 600 antibodies. An example of a resulting complex is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Example of a docking output. The 3s35 complex was generated by using the ClusPro server (docking results: "YES"; best docking interface score: −0.876).</p>
          </caption>
          <graphic xlink:href="bioinform_v3i1e29404_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The Rosetta interface scores were used as estimates of binding affinity in order to identify cognate antibody-antigen pairs to be used as input for machine learning. Complexes with interface scores higher than −8.0 were classified outright as complexes with poor binding, and those with interface scores lower than −9.0 were classified outright as complexes with good binding. For complexes with scores that ranged between −8.0 and −9.0, the docking clusters and positions were examined visually by using SwissDock (Swiss Institute of Bioinformatics). If the top 10 models had their antibodies and antigens in similar relative positions and the structures showed sensible interaction patterns, the pairs were classified as having a good binding affinity.</p>
        <p>Rosetta interface scores have been used previously as classifiers to determine binding affinity based on docking results (eg, in an antibody-antigen cross reactivity study [<xref ref-type="bibr" rid="ref15">15</xref>]).</p>
        <p>Additional data were extracted from the Coronavirus Antibody Database (CoV-AbDab) [<xref ref-type="bibr" rid="ref16">16</xref>]—a database of antibodies against coronaviruses, including SARS-CoV-2, SARS-CoV-1, and MERS-CoV (Middle East respiratory syndrome–related coronavirus). Data (2674 rows) were extracted from the CoV-AbDab on February 14, 2021. After filtering out incomplete data, 2031 rows remained, with each row corresponding to an antibody. The information extracted comprised the antibody names, their binding antigens, and their heavy and light variable region sequences, including the locations of the third CDRs (CDR3s). Each of the variable region sequences was searched against the international ImMunoGeneTics information system database [<xref ref-type="bibr" rid="ref17">17</xref>] in order to identify the locations of the first CDRs (CDR1s) and second CDRs (CDR2s) from the heavy and light chains. Since a row may contain information about an antibody’s interactions with multiple antigens, the data were further split into multiple rows, with each row containing information about the interaction between 1 antibody and 1 antigen.</p>
        <p>Additional features were calculated for the sequences, as follows. The isoelectric point for each CDR was calculated by using the Bachem peptide calculator analysis tool (Bachem Holding AG) [<xref ref-type="bibr" rid="ref18">18</xref>]. The average hydrophilicity of each CDR was also calculated by using the Bachem peptide calculator.</p>
        <p>B cell epitopes were predicted by using the IEDB (Immune Epitope Database) antibody epitope prediction analysis tool [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        <p>The resulting data set can be downloaded from GitHub [<xref ref-type="bibr" rid="ref20">20</xref>] and is structured with the following column headings: <italic>H chain CDR1 sequence</italic>, <italic>H chain CDR2 sequence</italic>, <italic>H chain CDR3 sequence</italic>, <italic>L chain CDR1 sequence</italic>, <italic>L chain CDR2 sequence</italic>, <italic>L chain CDR3 sequence</italic>, <italic>Hydrophilicity of L CDR1</italic>, <italic>pI of L CDR1</italic>, <italic>Hydrophilicity of L CDR2</italic>, <italic>pI of L CDR2</italic>, <italic>Hydrophilicity of L CDR3</italic>, <italic>pI of L CDR3</italic>, <italic>Hydrophilicity of H CDR1</italic>, <italic>pI of H CDR1</italic>, <italic>Hydrophilicity of H CDR2</italic>, <italic>pI of H CDR2</italic>, <italic>Hydrophilicity of H CDR3</italic>, <italic>pI of H CDR3</italic>, <italic>Antigen Epitope</italic>, <italic>Rosetta Docking score</italic>, <italic>Antigen</italic>, and <italic>Docking result</italic>.</p>
      </sec>
      <sec>
        <title>Machine Learning</title>
        <p>A weighted K-nearest neighbor (K-NN) classification algorithm [<xref ref-type="bibr" rid="ref21">21</xref>] for predicting antibody-antigen binding affinity was implemented in Python. The program can be downloaded from GitHub [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
        <p>For each antigen, the 11 to 15 antibodies that were docked were labeled as “good affinity” or “low affinity,” on the basis of the docking results. Machine learning was then performed, using the sequences of both antigens and antibodies.</p>
        <p>Neighbors were determined by using the string distances between the CDR1, CDR2, and CDR3 amino acid sequences of different antibodies. Weights were calculated from distances, so that nearer neighbors were considered to have more weight, as detailed below.</p>
        <p>For every antigen, the class (good affinity or low affinity) was learned by using the K-NN method, using a training subset (N − 1) of the labeled antigen-antibody sequence pairs and using the CDR string distances as features. The model performance was then evaluated on the remaining antigen-antibody sequence pair that was not used for training (leave-one-out cross-validation).</p>
        <p>In order to ensure that the K-NN pairs only included pairs with the same antigen, a fixed penalty of 1000 was added to the distances between antibody-antigen pairs involving different antigens.</p>
        <p>The similarity between antibodies was measured via a comparison of their CDRs. Each antibody has a heavy chain and a light chain, and each chain contains 3 CDRs. The distance between 2 antibodies was calculated as the Euclidean distance between their CDR distance vectors, as shown in the following equation (equation 1):</p>
        <p>
          <disp-formula>
            <graphic xlink:href="bioinform_v3i1e29404_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
        </p>
        <p>where (<italic>q<sub>i</sub></italic> – <italic>p<sub>i</sub></italic>) represents the string distance between the <italic>CDR<sub>i</sub></italic> of antibody <italic>q</italic> and the <italic>CDR<sub>i</sub></italic> of antibody <italic>p</italic>.</p>
        <p>The Python code is given in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>Two different CDR distance calculation methods were tested and compared; one was based on sequence identity, and the other used the BLOSUM62 matrix, as detailed below.</p>
        <p>For the identity-based distance measure, pairs of equivalent CDRs were compared with each other based on their Levenshtein string distances [<xref ref-type="bibr" rid="ref22">22</xref>], as shown in the following equation (equation 2):</p>
        <p>
          <disp-formula>
            <graphic xlink:href="bioinform_v3i1e29404_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
        </p>
        <p><italic>Cost</italic>=0 for <italic>a<sub>i</sub></italic>=<italic>b<sub>i</sub></italic>, <italic>Cost</italic>=1 for <italic>a<sub>i</sub></italic>≠<italic>b<sub>i</sub></italic></p>
        <p>The Levenshtein distance only accounts for amino acid identity when it is used for comparing sequences. A more biologically significant distance measure needs to take into account the different properties of amino acids, which means that some amino acid substitutions are more likely to be accepted in an interaction than others. The BLOSUM62 substitution matrix [<xref ref-type="bibr" rid="ref23">23</xref>] was used as a proxy for amino acid similarity in the Levenshtein distance calculation. Although the BLOSUM matrices were designed to reflect evolutionary conservation, they can provide an estimate of similarity in interaction potential [<xref ref-type="bibr" rid="ref24">24</xref>].</p>
        <p>The Levenshtein distance was calculated as per equation 2, using the following cost function:</p>
        <p>For <italic>a<sub>i</sub></italic>=<italic>b<sub>i</sub></italic>, <italic>Cost</italic>=0</p>
        <p>
          <disp-formula>
            <graphic xlink:href="bioinform_v3i1e29404_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
        </p>
        <p>where <italic>S<sub>ij</sub></italic>, <italic>S<sub>ii</sub></italic>, and <italic>S<sub>jj</sub></italic> are obtained from the BLOSUM62 matrix.</p>
        <p>The following columns from the data set were used to train the model for leave-one-out cross-validation: <italic>H chain CDR1 sequence</italic>, <italic>H chain CDR2 sequence</italic>, <italic>H chain CDR3 sequence</italic>, <italic>L chain CDR1 sequence</italic>, <italic>L chain CDR2 sequence</italic>, <italic>L chain CDR3 sequence</italic>, <italic>Antigen</italic>, and <italic>Docking result</italic>. The trained model was then evaluated on its ability to predict the docking results from the other columns.</p>
        <p>A random forest machine learning algorithm incorporating the previous K-NN results was also used for predicting antibody-antigen binding classification. The isoelectric point and net charge at neutral pH (7.0) for each CDR were used as additional features, alongside the BLOSUM62-derived CDR distances, for training the random forest. Binding was predicted by combining the votes from each of the features, and each individual feature contributed 1 vote, according to the nearest neighbor predictions based on each feature.</p>
        <p>The following columns from the data set were used for training the random forest: <italic>String distance (calculate by KNN method)</italic>, <italic>Hydrophilicity of L CDR1</italic>, <italic>pI of L CDR1</italic>, <italic>Hydrophilicity of L CDR2</italic>, <italic>pI of L CDR2</italic>, <italic>Hydrophilicity of L CDR3</italic>, <italic>pI of L CDR3</italic>, <italic>Hydrophilicity of H CDR1</italic>, <italic>pI of H CDR1</italic>, <italic>Hydrophilicity of H CDR2</italic>, <italic>pI of H CDR2</italic>, <italic>Hydrophilicity of H CDR3</italic>, <italic>pI of H CDR3</italic>, <italic>Antigen</italic>, and <italic>Docking result</italic>. The trained model was then evaluated on its ability to predict the docking results from the other columns.</p>
        <p>Each feature was considered as an individual decision tree and contributed 1 vote. For example, the isoelectric point of the CDR1 of an antibody’s heavy chain was considered as 1 feature, and the K-NN method was used, as previously described, to find the results of this decision tree. Altogether, there were 13 decision trees, and each tree used the K-NN method to determine its vote, for a total of 13 votes. The final decision was determined based on a simple majority vote. The best results were obtained when the whole forest (all 13 decision trees) took part in the vote.</p>
        <p>The performance of the K-NN and random forest learners was evaluated by using leave-one-out cross-validation on an antigen basis. For each of the 57 antigens, a training data set was constructed by removing 1 row, that is, 1 antibody-antigen pair, from the data set. After training with the remaining antibodies that bound to this antigen, model performance was evaluated based on the removed antibody. The process was repeated until all 5041 antibody-antigen pairs were tested. Model accuracy was calculated as the ratio of the number of correctly predicted antibody-antigen pairs over the total number of pairs in the data set.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Data Set</title>
        <p>A total of 600 antibody-antigen complexes were generated via the computational docking of 50 antibody structures with 50 antigen structures. In addition, a total of 4441 antibody-antigen pairs were extracted from the CoV-AbDab. The composition of this section of the data set is shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p>
        <p>In total, the data set contained 5041 antibody-antigen pairs comprising 1157 antibodies and 57 antigens.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Number of antibodies and positive and negative antibody-antigen pairs extracted from the Coronavirus Antibody Database.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="180"/>
            <col width="200"/>
            <col width="200"/>
            <col width="420"/>
            <thead>
              <tr valign="top">
                <td>Antigen</td>
                <td>Number of antibodies</td>
                <td>Positive samples, n</td>
                <td>Negative samples, n</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>SARS-CoV-2</td>
                <td>1943</td>
                <td>1912</td>
                <td>31</td>
              </tr>
              <tr valign="top">
                <td>SARS-CoV-1</td>
                <td>1241</td>
                <td>597</td>
                <td>644</td>
              </tr>
              <tr valign="top">
                <td>MERS-CoV<sup>a</sup></td>
                <td>264</td>
                <td>119</td>
                <td>145</td>
              </tr>
              <tr valign="top">
                <td>HCoV-OC43<sup>b</sup></td>
                <td>257</td>
                <td>21</td>
                <td>236</td>
              </tr>
              <tr valign="top">
                <td>HCoV-HKU1<sup>c</sup></td>
                <td>254</td>
                <td>84</td>
                <td>170</td>
              </tr>
              <tr valign="top">
                <td>HCoV-NL63<sup>d</sup></td>
                <td>258</td>
                <td>51</td>
                <td>207</td>
              </tr>
              <tr valign="top">
                <td>HCoV-229E<sup>e</sup></td>
                <td>207</td>
                <td>49</td>
                <td>158</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>MERS-CoV: Middle East respiratory syndrome–related coronavirus.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>HCoV-OC43: human coronavirus OC43.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>HCoV-HKU1: human coronavirus HKU1.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>HCoV-NL63: human coronavirus NL63.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>HCoV-229E: human coronavirus 229E.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Machine Learning</title>
        <p>The antigen-antibody binding classification methods were evaluated by using leave-one-out cross-validation. For a K value of 2 nearest neighbors, the K-NN method, when the Levenshtein distance was calculated based on sequence identity, achieved an accuracy of 81%. A slight improvement (accuracy of 82%) was observed when using the BLOSUM62 matrix to calculate the Levenshtein string distance.</p>
        <p>Different K values were also evaluated when the Levenshtein distance was calculated based on the BLOSUM62 matrix. A K value of 2 provided the best accuracy. For a K value of 1 nearest neighbor, the accuracy was 80%. For a K value of 3, classification accuracy dropped to 79%.</p>
        <p>For the random forest predictions, votes were used as the classification prediction results. The accuracy was highest when the whole forest was considered, in which case each feature contributed to the classification results. The performance of the random forest method was best (accuracy of 80%) when all 13 features—the Levenshtein string distance and the isoelectric point and net charge at neutral pH (7.0) for each CDR—took part in the final votes.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>We created a training and test data set of 5041 antibody-antigen complexes by using a combination of structure modeling and computational docking via Rosetta, together with antibody-antigen pairs extracted from the CoV-AbDab.</p>
      <p>We also developed weighted nearest neighbor and random forest approaches to predict antibody-antigen binding based on sequence data. These machine learning procedures can perform classifications to identify antigens that are likely to bind to a given antibody.</p>
      <p>Leave-one-out cross-validation testing yielded an accuracy of 82% for classification results that were based on 2 nearest neighbors. The prediction accuracy ranged from around 77% to 82% when varying the number of nearest neighbors. The best prediction results (accuracy of 82%) were obtained with 2 nearest neighbors, using string distance and BLOSUM62 matrices.</p>
      <p>This study demonstrates that the interaction between an antibody and a protein antigen can be predicted from the amino acid sequences of both the antibody’s variable regions and the antigen by using a relatively simple machine learning approach. Compared to the docking prediction method, which is based on the spatial protein structure, the method proposed in this project does not require a 3D structure and is more suitable for antibodies for which a 3D structure is unavailable.</p>
      <p>In the absence of large amounts of experimental data on antibody-antigen binding affinities, the Rosetta interface scores, along with the top 10 binding positions, were used to determine the classification for binding affinity. Although this method was unlikely to provide a full representation of the problem, it provided a data set suitable for comparing a range of approaches. This method will certainly improve as larger data sets become available. The docking data set contained 600 rows of antibody-antigen pairs. Subsets of this data set (200, 300, 400, and 500 rows) were tested during the data collection process. Classification accuracy was quite consistent across all of these subsets. This indicates that while the data set is limited, it provides a good starting point for the development of our approach for the prediction of antibody-antigen binding affinity, which can be further validated as more data become available.</p>
      <p>The K-NN method was chosen as the initial machine learning method. The best prediction results were obtained with 2 nearest neighbors (K=2). Random forests were also used that incorporated sequence distance as well as the chemical properties of CDRs (isoelectric point and hydrophobicity). The best prediction results (accuracy of 82%) were obtained with the nearest neighbor method when the Levenshtein distance was calculated based on BLOSUM62 matrices. The additional features included in the random forest did not improve classification accuracy, and this was probably due to these features’ dependence on the amino acid sequences.</p>
      <p>Around 20% (907/5041, 18%) of our method’s predictions were inaccurate. These errors mostly occurred with some large antigens. The docking results for these antigens were further examined. The decreased accuracy was likely the result of conformational flexibility in the larger antigens, the presence of multiple epitopes, and the higher number of discontinuous epitopes in larger antigens relative to the number of such epitopes in smaller antigens.</p>
      <p>As a step toward the development of a machine learning method suitable for predicting antibody-antigen binding affinities from sequence data, the weighted nearest neighbor and random forest machine learning approaches were applied to the problem. The basic hypothesis was that antibodies with similar sequences may be similar in terms of their ability to bind to a given antigen. A prediction program was coded in Python and evaluated via cross-validation on a data set containing 1157 antibodies and 57 antigens that were combined in 5041 antibody-antigen pairs. The best classification prediction accuracy was around 82% for this data set.</p>
      <p>These results provide a useful frame of reference, as well as protocols and considerations, for machine learning and data set creation in the prediction of antibody-antigen binding. Our method is still limited due to the scarcity of training data, but its usefulness for large-scale prediction should increase as more antibody-antigen binding data become available. The ability to predict antibody-antigen binding will allow for a more informed use of data from large-scale immune receptor sequencing. This, in turn, will increase our understanding of the variation in antigen recognition in an organism over time, under a range of conditions and between individuals and populations.</p>
      <p>Both the data set (in CSV format) and the machine learning program (coded in Python) are freely available for download on GitHub [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Python code for Euclidean distance calculation.</p>
        <media xlink:href="bioinform_v3i1e29404_app1.docx" xlink:title="DOCX File, 12 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ADAPT</term>
          <def>
            <p>Assisted Design of Antibody and Protein Therapeutics</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CDR</term>
          <def>
            <p>complementarity-determining region</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CDR1</term>
          <def>
            <p>first complementarity-determining region</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CDR2</term>
          <def>
            <p>second complementarity-determining region</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">CDR3</term>
          <def>
            <p>third complementarity-determining region</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">CoV-AbDab</term>
          <def>
            <p>Coronavirus Antibody Database</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">IEDB</term>
          <def>
            <p>Immune Epitope Database</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">K-NN</term>
          <def>
            <p>K-nearest neighbor</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MERS-CoV</term>
          <def>
            <p>Middle East respiratory syndrome–related coronavirus</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">PDB</term>
          <def>
            <p>Protein Data Bank</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dunn-Walters</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Townsend</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sinclair</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Immunoglobulin gene analysis as a tool for investigating human immune responses</article-title>
          <source>Immunol Rev</source>
          <year>2018</year>
          <month>07</month>
          <volume>284</volume>
          <issue>1</issue>
          <fpage>132</fpage>
          <lpage>147</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29944755"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/imr.12659</pub-id>
          <pub-id pub-id-type="medline">29944755</pub-id>
          <pub-id pub-id-type="pmcid">PMC6033188</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Crowe</surname>
              <given-names>JE Jr</given-names>
            </name>
          </person-group>
          <article-title>Deep sequencing and human antibody repertoire analysis</article-title>
          <source>Curr Opin Immunol</source>
          <year>2016</year>
          <month>06</month>
          <volume>40</volume>
          <fpage>103</fpage>
          <lpage>109</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/27065089"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.coi.2016.03.008</pub-id>
          <pub-id pub-id-type="medline">27065089</pub-id>
          <pub-id pub-id-type="pii">S0952-7915(16)30025-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC5203765</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weitzner</surname>
              <given-names>BD</given-names>
            </name>
            <name name-style="western">
              <surname>Jeliazkov</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Lyskov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Marze</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Kuroda</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Frick</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Adolf-Bryfogle</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Biswas</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Dunbrack</surname>
              <given-names>RL Jr</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>JJ</given-names>
            </name>
          </person-group>
          <article-title>Modeling and docking of antibody structures with Rosetta</article-title>
          <source>Nat Protoc</source>
          <year>2017</year>
          <month>02</month>
          <volume>12</volume>
          <issue>2</issue>
          <fpage>401</fpage>
          <lpage>416</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28125104"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/nprot.2016.180</pub-id>
          <pub-id pub-id-type="medline">28125104</pub-id>
          <pub-id pub-id-type="pii">nprot.2016.180</pub-id>
          <pub-id pub-id-type="pmcid">PMC5739521</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pires</surname>
              <given-names>DEV</given-names>
            </name>
            <name name-style="western">
              <surname>Ascher</surname>
              <given-names>DB</given-names>
            </name>
          </person-group>
          <article-title>mCSM-AB: a web server for predicting antibody-antigen affinity changes upon mutation with graph-based signatures</article-title>
          <source>Nucleic Acids Res</source>
          <year>2016</year>
          <month>07</month>
          <day>08</day>
          <volume>44</volume>
          <issue>W1</issue>
          <fpage>W469</fpage>
          <lpage>W473</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/27216816"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkw458</pub-id>
          <pub-id pub-id-type="medline">27216816</pub-id>
          <pub-id pub-id-type="pii">gkw458</pub-id>
          <pub-id pub-id-type="pmcid">PMC4987957</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vivcharuk</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Baardsnes</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Deprez</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Sulea</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Jaramillo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Corbeil</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Mullick</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Magoon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Marcil</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Durocher</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>O'Connor-McCourt</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Purisima</surname>
              <given-names>EO</given-names>
            </name>
          </person-group>
          <article-title>Assisted Design of Antibody and Protein Therapeutics (ADAPT)</article-title>
          <source>PLoS One</source>
          <year>2017</year>
          <month>07</month>
          <day>27</day>
          <volume>12</volume>
          <issue>7</issue>
          <fpage>e0181490</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0181490"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0181490</pub-id>
          <pub-id pub-id-type="medline">28750054</pub-id>
          <pub-id pub-id-type="pii">PONE-D-17-00252</pub-id>
          <pub-id pub-id-type="pmcid">PMC5531539</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kozakov</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>DR</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Porter</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Padhorny</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yueh</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Beglov</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Vajda</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>The ClusPro web server for protein-protein docking</article-title>
          <source>Nat Protoc</source>
          <year>2017</year>
          <month>02</month>
          <volume>12</volume>
          <issue>2</issue>
          <fpage>255</fpage>
          <lpage>278</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28079879"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/nprot.2016.169</pub-id>
          <pub-id pub-id-type="medline">28079879</pub-id>
          <pub-id pub-id-type="pii">nprot.2016.169</pub-id>
          <pub-id pub-id-type="pmcid">PMC5540229</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Comeau</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Gatchell</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Vajda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Camacho</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>ClusPro: an automated docking and discrimination method for the prediction of protein complexes</article-title>
          <source>Bioinformatics</source>
          <year>2004</year>
          <month>01</month>
          <day>01</day>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>45</fpage>
          <lpage>50</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btg371</pub-id>
          <pub-id pub-id-type="medline">14693807</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kozakov</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Brenke</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Comeau</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Vajda</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>PIPER: an FFT-based protein docking program with pairwise potentials</article-title>
          <source>Proteins</source>
          <year>2006</year>
          <month>11</month>
          <day>01</day>
          <volume>65</volume>
          <issue>2</issue>
          <fpage>392</fpage>
          <lpage>406</lpage>
          <pub-id pub-id-type="doi">10.1002/prot.21117</pub-id>
          <pub-id pub-id-type="medline">16933295</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Comeau</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Gatchell</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Vajda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Camacho</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>ClusPro: a fully automated algorithm for protein-protein docking</article-title>
          <source>Nucleic Acids Res</source>
          <year>2004</year>
          <month>07</month>
          <day>01</day>
          <volume>32</volume>
          <issue>Web Server issue</issue>
          <fpage>W96</fpage>
          <lpage>W99</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/15215358"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkh354</pub-id>
          <pub-id pub-id-type="medline">15215358</pub-id>
          <pub-id pub-id-type="pii">32/suppl_2/W96</pub-id>
          <pub-id pub-id-type="pmcid">PMC441492</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sircar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>JJ</given-names>
            </name>
          </person-group>
          <article-title>SnugDock: paratope structural optimization during antibody-antigen docking compensates for errors in antibody homology models</article-title>
          <source>PLoS Comput Biol</source>
          <year>2010</year>
          <month>01</month>
          <day>22</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>e1000644</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pcbi.1000644"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1000644</pub-id>
          <pub-id pub-id-type="medline">20098500</pub-id>
          <pub-id pub-id-type="pmcid">PMC2800046</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lyskov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>JJ</given-names>
            </name>
          </person-group>
          <article-title>The RosettaDock server for local protein-protein docking</article-title>
          <source>Nucleic Acids Res</source>
          <year>2008</year>
          <month>07</month>
          <day>01</day>
          <volume>36</volume>
          <issue>Web Server issue</issue>
          <fpage>W233</fpage>
          <lpage>W238</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/18442991"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkn216</pub-id>
          <pub-id pub-id-type="medline">18442991</pub-id>
          <pub-id pub-id-type="pii">gkn216</pub-id>
          <pub-id pub-id-type="pmcid">PMC2447798</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lyskov</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>FC</given-names>
            </name>
            <name name-style="western">
              <surname>Conchúir</surname>
              <given-names>SÓ</given-names>
            </name>
            <name name-style="western">
              <surname>Der</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Drew</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kuroda</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weitzner</surname>
              <given-names>BD</given-names>
            </name>
            <name name-style="western">
              <surname>Renfrew</surname>
              <given-names>PD</given-names>
            </name>
            <name name-style="western">
              <surname>Sripakdeevong</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Borgo</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Havranek</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhlman</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kortemme</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Bonneau</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Das</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Serverification of molecular modeling applications: the Rosetta Online Server that Includes Everyone (ROSIE)</article-title>
          <source>PLoS One</source>
          <year>2013</year>
          <month>05</month>
          <day>22</day>
          <volume>8</volume>
          <issue>5</issue>
          <fpage>e63906</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0063906"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0063906</pub-id>
          <pub-id pub-id-type="medline">23717507</pub-id>
          <pub-id pub-id-type="pii">PONE-D-13-06189</pub-id>
          <pub-id pub-id-type="pmcid">PMC3661552</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guex</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Peitsch</surname>
              <given-names>MC</given-names>
            </name>
          </person-group>
          <article-title>SWISS-MODEL and the Swiss-PdbViewer: an environment for comparative protein modeling</article-title>
          <source>Electrophoresis</source>
          <year>1997</year>
          <month>12</month>
          <volume>18</volume>
          <issue>15</issue>
          <fpage>2714</fpage>
          <lpage>2723</lpage>
          <pub-id pub-id-type="doi">10.1002/elps.1150181505</pub-id>
          <pub-id pub-id-type="medline">9504803</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <article-title>RCSB PDB: Homepage</article-title>
          <source>RCSB Protein Data Bank</source>
          <access-date>2019-07-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.rcsb.org/">https://www.rcsb.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kilambi</surname>
              <given-names>KP</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>JJ</given-names>
            </name>
          </person-group>
          <article-title>Structure-based cross-docking analysis of antibody-antigen interactions</article-title>
          <source>Sci Rep</source>
          <year>2017</year>
          <month>08</month>
          <day>15</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>8145</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-017-08414-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-017-08414-y</pub-id>
          <pub-id pub-id-type="medline">28811664</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-017-08414-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC5557897</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raybould</surname>
              <given-names>MIJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kovaltsuk</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Marks</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Deane</surname>
              <given-names>CM</given-names>
            </name>
          </person-group>
          <article-title>CoV-AbDab: The Coronavirus Antibody Database</article-title>
          <source>Bioinformatics</source>
          <year>2021</year>
          <month>05</month>
          <day>05</day>
          <volume>37</volume>
          <issue>5</issue>
          <fpage>734</fpage>
          <lpage>735</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32805021"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa739</pub-id>
          <pub-id pub-id-type="medline">32805021</pub-id>
          <pub-id pub-id-type="pii">5893556</pub-id>
          <pub-id pub-id-type="pmcid">PMC7558925</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <article-title>IMGT home page</article-title>
          <source>The international ImMunoGeneTics information system</source>
          <access-date>2020-01-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.imgt.org">http://www.imgt.org</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <article-title>Peptide calculator</article-title>
          <source>Bachem</source>
          <access-date>2020-03-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bachem.com/knowledge-center/peptide-calculator/">https://www.bachem.com/knowledge-center/peptide-calculator/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <article-title>Antibody epitope prediction</article-title>
          <source>Immune Epitope Database</source>
          <access-date>2020-07-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://tools.iedb.org/bcell/">http://tools.iedb.org/bcell/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>Chao</given-names>
            </name>
          </person-group>
          <article-title>jessye123/ab-ag-seq-machine-learning</article-title>
          <source>GitHub</source>
          <access-date>2022-10-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/jessye123/ab-ag-seq-machine-learning">https://github.com/jessye123/ab-ag-seq-machine-learning</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taunk</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>De</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Verma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Swetapadma</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A brief review of nearest neighbor algorithm for learning and classification</article-title>
          <year>2019</year>
          <conf-name>2019 International Conference on Intelligent Computing and Control Systems (ICCS)</conf-name>
          <conf-date>May 15-17, 2019</conf-date>
          <conf-loc>Madurai, India</conf-loc>
          <fpage>1255</fpage>
          <lpage>1260</lpage>
          <pub-id pub-id-type="doi">10.1109/iccs45141.2019.9065747</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Levenshtein</surname>
              <given-names>VI</given-names>
            </name>
          </person-group>
          <article-title>Binary codes capable of correcting deletions, insertions and reversals</article-title>
          <source>Soviet Physics Doklady</source>
          <year>1966</year>
          <month>02</month>
          <volume>10</volume>
          <issue>8</issue>
          <fpage>707</fpage>
          <lpage>710</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://nymity.ch/sybilhunting/pdf/Levenshtein1966a.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Henikoff</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Henikoff</surname>
              <given-names>JG</given-names>
            </name>
          </person-group>
          <article-title>Amino acid substitution matrices from protein blocks</article-title>
          <source>Proc Natl Acad Sci U S A</source>
          <year>1992</year>
          <month>11</month>
          <day>15</day>
          <volume>89</volume>
          <issue>22</issue>
          <fpage>10915</fpage>
          <lpage>10919</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/1438297"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.89.22.10915</pub-id>
          <pub-id pub-id-type="medline">1438297</pub-id>
          <pub-id pub-id-type="pmcid">PMC50453</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Yu-An</given-names>
            </name>
            <name name-style="western">
              <surname>You</surname>
              <given-names>Zhu-Hong</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>Xin</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>Leon</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Lirong</given-names>
            </name>
          </person-group>
          <article-title>Using Weighted Sparse Representation Model Combined with Discrete Cosine Transformation to Predict Protein-Protein Interactions from Protein Sequence</article-title>
          <source>Biomed Res Int</source>
          <year>2015</year>
          <volume>2015</volume>
          <fpage>902198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1155/2015/902198"/>
          </comment>
          <pub-id pub-id-type="doi">10.1155/2015/902198</pub-id>
          <pub-id pub-id-type="medline">26634213</pub-id>
          <pub-id pub-id-type="pmcid">PMC4641304</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
