<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JBB</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Bioinform Biotech</journal-id>
      <journal-title>JMIR Bioinformatics and Biotechnology</journal-title>
      <issn pub-type="epub">2563-3570</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v3i1e30890</article-id>
      <article-id pub-id-type="pmid"/>
      <article-id pub-id-type="doi">10.2196/30890</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>An Analysis of Different Distance-Linkage Methods for Clustering Gene Expression Data and Observing Pleiotropy: Empirical Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sengupta</surname>
            <given-names>Deepanwita</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhang</surname>
            <given-names>Rong</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Hetti Arachchilage</surname>
            <given-names>Madara</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Choudhury</surname>
            <given-names>Joydhriti</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9056-5192</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Ashraf</surname>
            <given-names>Faisal Bin</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Brac University</institution>
            <addr-line>Mohakhali</addr-line>
            <addr-line>Dhaka</addr-line>
            <country>Bangladesh</country>
            <phone>880 9617445125</phone>
            <email>faisal.ashraf@bracu.ac.bd</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4006-5389</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Brac University</institution>
        <addr-line>Dhaka</addr-line>
        <country>Bangladesh</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Faisal Bin Ashraf <email>faisal.ashraf@bracu.ac.bd</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <season>Jan-Dec</season>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>17</day>
        <month>6</month>
        <year>2022</year>
      </pub-date>
      <volume>3</volume>
      <issue>1</issue>
      <elocation-id>e30890</elocation-id>
      <history>
        <date date-type="received">
          <day>2</day>
          <month>6</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>28</day>
          <month>8</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>10</day>
          <month>5</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>29</day>
          <month>5</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Joydhriti Choudhury, Faisal Bin Ashraf. Originally published in JMIR Bioinformatics and Biotechnology (https://bioinform.jmir.org), 17.06.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Bioinformatics and Biotechnology, is properly cited. The complete bibliographic information, a link to the original publication on https://bioinform.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://bioinform.jmir.org/2022/1/e30890" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large amounts of biological data have been generated over the last few decades, encouraging scientists to look for connections between genes that cause various diseases. Clustering illustrates such a relationship between numerous species and genes. Finding an appropriate distance-linkage metric to construct clusters from diverse biological data sets has thus become critical. Pleiotropy is also important for a gene’s expression to vary and create varied consequences in living things. Finding the pleiotropy of genes responsible for various diseases has become a major research challenge.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Our goal was to establish the optimal distance-linkage strategy for creating reliable clusters from diverse data sets and identifying the common genes that cause various tumors to observe genes with pleiotropic effect.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We considered 4 linkage methods—single, complete, average, and ward—and 3 distance metrics—Euclidean, maximum, and Manhattan distance. For assessing the quality of different sets of clusters, we used a fitness function that combines silhouette width and within-cluster distance.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>According to our findings, the maximum distance measure produces the highest-quality clusters. Moreover, the average linkage method works best for medium data sets, and the ward linkage method works best for large data sets. The outcome is not improved by using ensemble clustering. We also discovered genes that cause 3 different cancers and used gene enrichment to confirm our findings.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Accuracy is crucial in clustering, and we investigated the accuracy of numerous clustering techniques in our research. Other studies may aid related works if the data set is similar to ours.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>gene clustering</kwd>
        <kwd>gene expression</kwd>
        <kwd>distance metric</kwd>
        <kwd>linkage method</kwd>
        <kwd>hierarchical clustering</kwd>
        <kwd>pleiotropy</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>A substantial amount of genetic data began to accumulate in the hands of bioinformatics experts at the turn of the 21st century. The process was accelerated by advances in hardware technology and improved computer algorithms. Scientists began storing all of this genomic information in sequential data [<xref ref-type="bibr" rid="ref1">1</xref>] and intensity matrix [<xref ref-type="bibr" rid="ref2">2</xref>] formats. Different types of sequences, such as protein, DNA, and RNA sequences, are kept in sequential data format, and the intensity matrix preserves gene behavior under various conditions. To record and analyze gene behavior on sample individuals, these conditions can vary under varied light intensities.</p>
      <p>Microarray [<xref ref-type="bibr" rid="ref3">3</xref>] is a type of intensity matrix in which each row represents a single gene, and each column indicates that gene’s behavior in a given situation. A microarray data set’s sample structure is shown in <xref ref-type="table" rid="table1">Table 1</xref>. Four genes express themselves at 3 different times or circumstances. Depending on the normalization approach used, the values stored in a microarray data set can be both positive and negative.</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Sample microarray data.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="250"/>
          <col width="250"/>
          <col width="250"/>
          <col width="250"/>
          <thead>
            <tr valign="top">
              <td>Genes</td>
              <td>Time 1</td>
              <td>Time 2</td>
              <td>Time 3</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Gene 1</td>
              <td>0.25</td>
              <td>0.22</td>
              <td>0.65</td>
            </tr>
            <tr valign="top">
              <td>Gene 2</td>
              <td>–0.75</td>
              <td>1.25</td>
              <td>–0.63</td>
            </tr>
            <tr valign="top">
              <td>Gene 3</td>
              <td>0.05</td>
              <td>0.66</td>
              <td>0.75</td>
            </tr>
            <tr valign="top">
              <td>Gene 4</td>
              <td>1.25</td>
              <td>–0.52</td>
              <td>0.15</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <p>Researchers have been extracting valuable biological information from microarray data. The construction of a phylogenetic tree is one of the most extensively used methodologies [<xref ref-type="bibr" rid="ref4">4</xref>]. The evolutionary relationships between numerous species are shown by the phylogenetic tree. In the case of genes, it calculates gene similarity to create a gene tree that depicts how particular genes have evolved [<xref ref-type="bibr" rid="ref5">5</xref>]. Although phylogenetic trees are based on sequence data because mutations occur in any species’ genome sequence, genome sequences are comparatively large and need a lot of computing power and memory. Gene expression represents phenotypes of a gene, and different genes exhibit variable levels of expression under the same conditions [<xref ref-type="bibr" rid="ref6">6</xref>]. As a result, we can employ phenotype, which is a measurement of the genes’ reflection due to genotype differences. The expression level of genes calculates how near they are to one another using the microarray data set as an input, because the transcriptional activity of similar genes should be similar [<xref ref-type="bibr" rid="ref7">7</xref>]. A tree is built by connecting all closely related genes one by one, with each leaf representing a single gene and branches separating one group of genes from another [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. This hierarchical tree can aid in the creation of more precise groupings. It assists biologists in determining and comprehending the function of an unknown gene. As a result, developing appropriate metrics for clustering microarray data is a significant scientific challenge.</p>
      <p>Different clustering approaches have been presented to extract information from the microarray data set [<xref ref-type="bibr" rid="ref10">10</xref>]. Clustering algorithms divide unclassified data into distinct classified groups [<xref ref-type="bibr" rid="ref11">11</xref>], with the most comparable data points grouped together. As a result, if an unknown element belongs to a recognized cluster, it becomes easier for the researcher to forecast its properties. Clustering is a technique used in bioinformatics to organize microarray data and predict properties of unknown genes based on which cluster they belong to [<xref ref-type="bibr" rid="ref11">11</xref>]. Furthermore, bioinformatics workflow [<xref ref-type="bibr" rid="ref12">12</xref>] and immune repertoire profiling [<xref ref-type="bibr" rid="ref13">13</xref>] are classified using hierarchical clustering, a sort of clustering technique. It also has applications in the prediction of nonsmall cell lung cancer metastasis [<xref ref-type="bibr" rid="ref14">14</xref>], the high-confidence identification of B cell clones [<xref ref-type="bibr" rid="ref15">15</xref>], and the identification of cell type from a single cell transcriptome [<xref ref-type="bibr" rid="ref16">16</xref>]. It is also used to create a phylogenetic tree using microarray data [<xref ref-type="bibr" rid="ref15">15</xref>]. The hierarchical clustering methodology uses a distance algorithm to calculate the distance between distinct genes after inputting microarray data. The distance is then used to connect closely related genes in clusters using a linkage approach.</p>
      <p>Various distance methods are employed depending on the data set’s characteristics. The way the 2 distance methods determine the difference between 2 distant data points is the fundamental distinction between them. Euclidean [<xref ref-type="bibr" rid="ref17">17</xref>], Chebyshev [<xref ref-type="bibr" rid="ref18">18</xref>], and other distance approaches are common. After applying the distance approach, the hierarchical clustering technique connects related genes using several types of linking methods to form a cluster. Single linkage method [<xref ref-type="bibr" rid="ref19">19</xref>], complete linkage method [<xref ref-type="bibr" rid="ref20">20</xref>], average linkage method [<xref ref-type="bibr" rid="ref20">20</xref>], and others are some of the most used linkage methods. Linkage methods connect genes in a bottom-up manner, eventually resulting in a hierarchical tree, often known as a phylogenetic tree. As computational ability and technology progress, it has become increasingly important to establish reliable clusters of related genes to understand unknown genes in sensitive domains such as health care and disease prediction.</p>
      <p>Pleiotropy is another key phenomenon identified in the investigation of gene functions behind many diseases. Pleiotropy occurs when a single gene influences many phenotypic features [<xref ref-type="bibr" rid="ref21">21</xref>]. There are numerous examples of multiple genes working together to cause a single disease [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref24">24</xref>]. Furthermore, it appears that a single gene is responsible for several disorders [<xref ref-type="bibr" rid="ref25">25</xref>]. Even though we can identify diseases caused by the same gene, the gene’s impact on each disease is different. It may appear to be more active in some disorders than in others. As a result, we can visualize the impact of a gene on other diseases if we can detect commonalities in their expressions for different diseases and quantify the distance.</p>
      <p>In this work, we used a variety of data sets to investigate different distance-linkage combinations for hierarchical clustering. These clusters have revealed which gene groupings are closely connected to one another. We also assessed the fitness of those groupings and attempted to determine which distance-linkage combination produced the greatest results. We validated our findings using 8 different data sets. Furthermore, we used the best measure to identify common genes responsible for various tumors. Gene enrichment scores about their influence on various diseases were used to corroborate our findings.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>This section goes over our proposed methodology. First, we provided the proposed workflow for determining the optimum clustering distance-linkage approach. Then we went over several distance metrics, linkage methods, and our selection procedure for comparing the performance of various combinations. Finally, the pleiotropic gene observation methodology is discussed.</p>
      <sec>
        <title>Identifying the Best Distance-Linkage Method</title>
        <p>Our investigation begins with the import of a microarray data set into our procedure. This microarray data set is typically a 2D array, with rows representing different genes and columns representing their intensity at various time stamps. To minimize the dimensionality of the data set, we will use Principal Component Analysis. It is a sophisticated approach used by academics to remove irrelevant data from a data set while keeping its integrity.</p>
        <p>Then, in our data set, we run a distance metric. A distance measure, in general, calculates the similarity of 2 genes and determines how far apart they are. We employed the following 3 different distance metrics: Euclidean, Manhattan, and maximum. We chose a linkage method to connect related genes and generate a hierarchical tree after picking the distance metric. We used the following 4 linkage methods: single, complete, average, and ward linkage methods. We constructed a hierarchical tree using the distance-linkage method, where each leaf represents a gene, and the branches reflect the dissimilarity among them. The tree was then cut to various heights, resulting in several sets of genes for each cut point. Subsequently, we identified the appropriate cut point for that hierarchical tree by calculating how well those genes are clustered on different cut points. We used “Average Silhouette Width” and “Distance within Cluster” to calculate the fitness of the groups formed by different cut locations. The optimal fitness value is calculated using these fitness values. We determined the best combination of distance and linkage methods for a single data set by repeating this process with different combinations of distance and linkage methods. <xref rid="figure1" ref-type="fig">Figure 1</xref> depicts the algorithm.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Proposed algorithm for finding the best distance-linkage combination. Input: Microarray data set. Output: Distance-linkage combination.</p>
          </caption>
          <graphic xlink:href="bioinform_v3i1e30890_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>For a particular data set, D, optimal fitness value can be expressed by the following equations:</p>
        <p>
          <disp-formula>
            <graphic xlink:href="bioinform_v3i1e30890_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
        </p>
        <p>Where d <inline-graphic xlink:href="bioinform_v3i1e30890_fig17.png" xlink:type="simple" mimetype="image"/> distance methods, and l <inline-graphic xlink:href="bioinform_v3i1e30890_fig17.png" xlink:type="simple" mimetype="image"/> linkage methods.</p>
        <sec>
          <title>Used Distance Methods</title>
          <p>Euclidean distance uses Pythagorean formula to calculate the distance between 2 genes. For n dimensional space, we can write that formula as follows:</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
          <p>Unlike Euclidean distance, Manhattan distance takes the modulus value of the subtraction. For n-dimensional space, the equation of Manhattan Distance will be as follows:</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
          <p>Maximum distance, on the other hand, calculates the subtraction value for each column before selecting the highest number. The formula for n-dimensional space is as follows:</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
        </sec>
        <sec>
          <title>Used Linkage Methods</title>
          <p>The single linkage approach connects 2 clusters by taking the shortest distance between them. The equation for the single linkage method to calculate the distance between any element and another element in another group is as follows:</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
          <p>Where p is an element in cluster P and q is an element of cluster Q.</p>
          <p>To compute the distance, the complete technique uses the farthest points in 2 clusters and connects the clusters with the shortest distance. The equation for the complete linkage approach is as follows:</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig9.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
          <p>The average method determines the average value for each gene inside the cluster, then connects them one by one on each layer to form a hierarchical tree. Equation 7 is the average linkage method update formula.</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig10.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
          <p>Where m is all the instances of cluster a, and n is all the instances of cluster b.</p>
          <p>A centroid point is determined using Ward linkage (much like the centroid method). The squared distance value of each point in each cluster is then calculated using that centroid. It then sums all the squared distance values obtained by the 2 clusters together. It takes the smallest total value produced by a cluster pair and merges them on that level after repeating the same technique for every cluster on the same level. Equation 8 is the Ward linkage method update formula.</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
        </sec>
        <sec>
          <title>Metrics Used to Calculate Fitness</title>
          <p>The fitness of the clusters we acquired after cutting the hierarchical tree at a specific height was calculated using the following 2 metrics: average silhouette width (ASW) and distance within cluster. The following formula is used to compute silhouette width:</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
          <p>Where a(i) is the average distance between object i and all the other points of the cluster to which i belongs; b(i) is the distance to the closest point in another cluster; and s(i) is the silhouette value between 2 clusters.</p>
          <p>ASW is the average of all the silhouette values. Generally, it varies from –1 to 1, and the value closer to 1 is considered better.</p>
          <p>The distance within a cluster is used to determine how close the elements are. Each cluster’s centroid is chosen during this process. The distance between each object in the cluster and the centroid is then determined as an average. This calculation’s formula is as follows:</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig13.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
          <p>Where dist(c,i) is the distance between centroid c and element i in a cluster; E is the set of elements in the cluster; and |E| is the number of elements in the cluster.</p>
          <p>From the characteristics, we can understand that ASW measures the quality of clusters. A greater ASW indicates good quality of clusters, that is, for a data set D, distance metric d and linkage method l,</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig14.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
          <p>Where S<sub>i</sub> is the ASW for cut point i.</p>
          <p>However, distance within clusters measures how compact the data points are in the clusters. Therefore, better-quality clusters will have lower distance within clusters, that is,</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
          <p>Where W<sub>i</sub> is the distance within clusters for cut point i.</p>
          <p>Thus, to compare the quality of clusters we acquired at different cut points i in the hierarchical tree, our fitness function combines these 2 criteria. When these 2 relationships are combined, our fitness function becomes as follows:</p>
          <p>
            <disp-formula>
              <graphic xlink:href="bioinform_v3i1e30890_fig16.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
            </disp-formula>
          </p>
          <p>From this function, we can find out the optimal fitness for a specific combination of metrics in a certain data set.</p>
        </sec>
      </sec>
      <sec>
        <title>Cluster Ensemble</title>
        <p>We will try ensemble clustering [<xref ref-type="bibr" rid="ref26">26</xref>] to see if it works better once we have tried different clustering combinations. Three ensemble clustering techniques were employed, which are as follows: (1) similarity partitioning based on clusters; (2) hypergraph partitioning algorithm [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]; (3) meta-clustering algorithm.</p>
        <sec>
          <title>Cluster-Based Similarity Partitioning</title>
          <p>It starts by creating an n×n binary matrix in which the entry is 1 if two objects belong to the same cluster and 0 otherwise. Every clustering approach is put through it. The final ensemble cluster is then generated using an entry-wise average of all clustering approaches.</p>
        </sec>
        <sec>
          <title>Hypergraph Partitioning Algorithm</title>
          <p>The data set is represented as a hypergraph by this algorithm. The hypergraph is then partitioned to determine the smallest number of edges. It produces the ensemble cluster based on the smallest number of edges.</p>
        </sec>
        <sec>
          <title>Metaclustering Algorithm</title>
          <p>The metaclustering algorithm starts by creating numerous clusters from a data set. The dissimilarity between those clusters is then calculated, and a metacluster is generated as a result of that measurement. In this approach, the ensemble is represented by the final metacluster.</p>
          <p>One of the most important characteristics of these algorithms is that the number of clusters that the algorithm will build must be declared at the start. For the specified data set, we used the cluster number created by the best distance-linkage combination.</p>
        </sec>
      </sec>
      <sec>
        <title>Observing Pleiotropy for Different Cancers</title>
        <p>We identified the genes responsible for various cancer tumors from the data sets and then evaluated their expression in different patients with cancer to report their various phenotypes in order to discover the pleiotropic behavior of distinct genes. We built a secondary data set by extracting the expression data for each gene from each data set after identifying the common genes across these disorders. Every primary data set must contain an equal number of time stamp values in order to build a 2D microarray data set. The data sets, however, have different numbers of columns. Central nervous system, for example, includes 60 time stamps for a single gene, but the ALL-AML (acute lymphoblastic leukemia-acute myeloid leukemia) data set has 72 time stamps. We cannot modify or remove any columns from the data set because doing so could compromise the data’s integrity or result in the loss of valuable information. To address this issue, we estimated the mean, median, standard deviation, and variance, which may be used to summarize numerical data [<xref ref-type="bibr" rid="ref30">30</xref>], and we used these numbers to construct our secondary data set. We will design a hierarchical tree using the perfect distance-linkage method found in the previously presented method because we have a data set for each gene with pleiotropic behavior. For that particular gene, the diseases that are closest to each other share similar summarized statistics. As a result, these trees will aid our understanding of how a single gene exhibits various phenotypes in patients with cancer. Furthermore, the gene enrichment scores of these common genes for the disorders that are frequent will be used to corroborate our findings.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Since no human or animal trial was conducted during this research, the authors did not apply for an ethical approval for the study.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>We will discuss the experimental outcomes we discovered in our research in this part. We started by explaining the data sets we used. The findings for various distance-linkage method combinations were then shown. We later presented our findings in terms of pleiotropy for the shared genes.</p>
      <sec>
        <title>Data Set</title>
        <p>We obtained gene expression data for various cancers from a publicly accessible database [<xref ref-type="bibr" rid="ref31">31</xref>]. Every data set includes the disease-causing genes as well as their expression in various patients with the same condition. We also examined a data set from a variety of disorders to confirm that our findings were disease-agnostic. We used 7 data sets for various cancers. <xref ref-type="table" rid="table2">Table 2</xref> lists the specifics of each data set.</p>
        <p>The number of genes and patients, or the number of conditions for each gene, differs among these data sets. We used a diverse data set to discover the ideal metrics, which can be applied to any gene expression data set. Furthermore, these databases contain certain genes that are widely used. We have created a secondary data set to explore and analyze those genes further.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Description of data sets.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="210"/>
            <col width="310"/>
            <col width="240"/>
            <col width="240"/>
            <thead>
              <tr valign="top">
                <td>Data set</td>
                <td>Data domain</td>
                <td>Number of patients</td>
                <td>Number of genes</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>CNS<sup>a</sup></td>
                <td>Central nervous system</td>
                <td>60</td>
                <td>7129</td>
              </tr>
              <tr valign="top">
                <td>ALL-AML<sup>b</sup></td>
                <td>Acute lymphocytic leukemia</td>
                <td>72</td>
                <td>7129</td>
              </tr>
              <tr valign="top">
                <td>Lung cancer</td>
                <td>Lung cancer</td>
                <td>181</td>
                <td>12,533</td>
              </tr>
              <tr valign="top">
                <td>Ovarian cancer</td>
                <td>Ovarian cancer</td>
                <td>253</td>
                <td>15,154</td>
              </tr>
              <tr valign="top">
                <td>Lymphoma</td>
                <td>Lymphoma</td>
                <td>62</td>
                <td>4022</td>
              </tr>
              <tr valign="top">
                <td>SRBCT<sup>c</sup></td>
                <td>Small round blue cell tumor</td>
                <td>83</td>
                <td>2308</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>CNS: central nervous system.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>ALL-AML: acute lymphoblastic leukemia-acute myeloid leukemia.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>SRBCT: small round blue cell tumor.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Result of Experiments for Identifying the Best Distance-Linkage Method</title>
        <p>In our experiment, we employed several combinations of distance measurements and linkage algorithms to generate a hierarchical tree. To validate our findings, we used 3 distance metrics and 4 linkage methods. We combined these 3 distance metrics and 4 linkage methods to build 12 hierarchical trees for each data set. We cut each tree on numerous cut points after building hierarchical trees. As a result, the tree has been separated into several distinct groups. We assessed the fitness value for each cut point and selected the highest as the ideal value for that hierarchical tree given that particular distance metric-linkage method combination.</p>
        <p>A portion of a hierarchical tree of genes from the lung cancer data set is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>. This tree was constructed using the maximum-Ward combination. The full tree has a large number of leaves due to the data set’s 12,533 genes. All the values are calculated using Equation 13, and the best values for each combination of distance metric and linkage method are shown in <xref ref-type="table" rid="table3">Table 3</xref>.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Hierarchical tree created using the maximum-Ward method on lung cancer data set.</p>
          </caption>
          <graphic xlink:href="bioinform_v3i1e30890_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Fitness value for different combinations of distance and linkage metrics.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="250"/>
            <col width="240"/>
            <col width="240"/>
            <col width="240"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Data set and linkage</td>
                <td>Manhattan distance</td>
                <td>Euclidean distance</td>
                <td>Maximum distance</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>CNS<sup>a</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Average</td>
                <td>3.80×10<sup>-13</sup></td>
                <td>9.47×10<sup>-12</sup></td>
                <td>3.50×10<sup>-11</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Complete</td>
                <td>1.42×10<sup>-13</sup></td>
                <td>6.78×10<sup>-12</sup></td>
                <td>3.44×10<sup>-12</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Single</td>
                <td>2.59×10<sup>-13</sup></td>
                <td>5.72×10<sup>-12</sup></td>
                <td>2.16×10<sup>-11</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ward</td>
                <td>4.49×10<sup>-14</sup></td>
                <td>3.22×10<sup>-13</sup></td>
                <td>3.09×10<sup>-12</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>ALL-AML<sup>b</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Average</td>
                <td>1.20×10<sup>-6</sup></td>
                <td>1.45×10<sup>-5</sup></td>
                <td>3.39×10<sup>-5</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Complete</td>
                <td>8.89×10<sup>-7</sup></td>
                <td>2.11×10<sup>-5</sup></td>
                <td>1.51×10<sup>-5</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Single</td>
                <td>1.11×10<sup>-6</sup></td>
                <td>1.37×10<sup>-5</sup></td>
                <td>1.24×10<sup>-5</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ward</td>
                <td>4.41×10<sup>-7</sup></td>
                <td>2.64×10<sup>-6</sup></td>
                <td>3.07×10<sup>-5</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>Lung cancer</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Average</td>
                <td>5.56×10<sup>-8</sup></td>
                <td>1.48×10<sup>-6</sup></td>
                <td>3.36×10<sup>-6</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Complete</td>
                <td>5.35×10<sup>-8</sup></td>
                <td>1.23×10<sup>-6</sup></td>
                <td>1.52×10<sup>-9</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Single</td>
                <td>5.33×10<sup>-8</sup></td>
                <td>6.47×10<sup>-7</sup></td>
                <td>5.86×10<sup>-7</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ward</td>
                <td>3.03×10<sup>-8</sup></td>
                <td>1.19×10<sup>-6</sup></td>
                <td>6.71×10<sup>-6</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>Ovarian</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Average</td>
                <td>1.25×10<sup>-5</sup></td>
                <td>1.59×10<sup>-4</sup></td>
                <td>2.87×10<sup>-4</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Complete</td>
                <td>1.71×10<sup>-5</sup></td>
                <td>7.49×10<sup>-5</sup></td>
                <td>6.28×10<sup>-4</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Single</td>
                <td>2.88×10<sup>-6</sup></td>
                <td>3.12×10<sup>-4</sup></td>
                <td>1.28×10<sup>-4</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ward</td>
                <td>2.49×10<sup>-4</sup></td>
                <td>3.44×10<sup>-5</sup></td>
                <td>9.31×10<sup>-4</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>Lymphoma</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Average</td>
                <td>1.29×10<sup>-7</sup></td>
                <td>2.81×10<sup>-6</sup></td>
                <td>9.66×10<sup>-6</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Complete</td>
                <td>2.21×10<sup>-8</sup></td>
                <td>2.34×10<sup>-6</sup></td>
                <td>6.00×10<sup>-6</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Single</td>
                <td>1.01×10<sup>-7</sup></td>
                <td>2.81×10<sup>-6</sup></td>
                <td>8.10×10<sup>-6</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ward</td>
                <td>1.23×10<sup>-8</sup></td>
                <td>6.05×10<sup>-7</sup></td>
                <td>2.82×10<sup>-6</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>SRBCT<sup>c</sup></bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Average</td>
                <td>1.52×10<sup>-7</sup></td>
                <td>6.73×10<sup>-6</sup></td>
                <td>4.41×10<sup>-5</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Complete</td>
                <td>1.03×10<sup>-7</sup></td>
                <td>4.72×10<sup>-6</sup></td>
                <td>3.73×10<sup>-5</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Single</td>
                <td>8.24×10<sup>-8</sup></td>
                <td>4.34×10<sup>-6</sup></td>
                <td>3.00×10<sup>-5</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Ward</td>
                <td>3.88×10<sup>-9</sup></td>
                <td>8.55×10<sup>-8</sup></td>
                <td>2.67×10<sup>-6</sup></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>CNS: central nervous system.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>ALL-AML: acute lymphoblastic leukemia-acute myeloid leukemia.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>SRBCT: small round blue cell tumor.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Ensemble Result</title>
        <p>We chose the data set (ALL-AML) for testing and ran these 4 ensemble clustering techniques. For this data set, the maximum-average combination produced the best result, with a cluster number of 135. <xref ref-type="table" rid="table4">Table 4</xref> displays the fitness values. We discovered that no ensemble clustering approach improves fitness value in any way.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Fitness value for different ensemble techniques.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Ensemble techniques</td>
                <td>Fitness value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>CSPA<sup>a</sup></td>
                <td>4.32×10<sup>-6</sup></td>
              </tr>
              <tr valign="top">
                <td>HGPA<sup>b</sup></td>
                <td>3.29×10<sup>-6</sup></td>
              </tr>
              <tr valign="top">
                <td>MCLA<sup>c</sup></td>
                <td>1.53×10<sup>-5</sup></td>
              </tr>
              <tr valign="top">
                <td>Maximum-average</td>
                <td>3.39×10<sup>-5</sup></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>CSPA: cluster-based similarity partitioning.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>HGPA: hyper graph partitioning algorithm.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>MCLA: metaclustering algorithm.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Result Analysis for Common Genes</title>
        <p>Multiple tumors can be caused by a small number of genes. We discovered 9 genes linked to the following 3 types of cancer: central nervous system, lymphoma, and lung cancer. AFFX-TrpnX-5 at, AFFX-ThrX-5 at, AFFX-ThrX-3 at, AFFX-PheX-M at, AFFX-PheX-5 at, AFFX-PheX-3 at, AFFX-LysX-M at, AFFX-LysX-3 at, and AFFX-LysX-5 at were discovered to be common genes. We found the gene enrichment score publicly available at [<xref ref-type="bibr" rid="ref32">32</xref>] to confirm our findings. Gene enrichment scores in various malignancies are given in <xref rid="figure3" ref-type="fig">Figure 3</xref> for the discovered common genes.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Gene enrichment score vs cancer type.</p>
          </caption>
          <graphic xlink:href="bioinform_v3i1e30890_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The maximum distance method combined with the average linkage method produces better hierarchical trees in 4 data sets (central nervous system, leukemia, lymphoma, and SRBCT), according to the fitness values provided in <xref ref-type="table" rid="table3">Table 3</xref>. These data sets are medium in size, with 2000-7000 rows and 60-80 columns, as shown in <xref ref-type="table" rid="table2">Table 2</xref>. In the Spellman data set, however, the maximum-average combination also excels. The other 4 data sets reflect human genes that are responsible for specific tumors, whereas Spellman is a microarray data set of bacteria. However, the maximum distance approach with Ward linkage method constructs a superior hierarchical tree compared with the other methods in 2 of the largest data sets, lung and ovarian. These 2 data sets are larger than the others, and they share no genes with the others.</p>
        <p>The maximum distance metric outperforms the other 2 distance methods among the 3 most commonly used distance metrics. Maximum distance considers only 1 column where the 2 genes have the most variance when calculating the distance between them. The Euclidean and Manhattan distance methods, on the other hand, would have taken distances across all columns. As a result, the dissimilarity values for the Euclidean and Manhattan distances are approaching the maximum distance. Consequently, in clustering, the Euclidean and Manhattan distances place points slightly farther apart than the maximum distance. Furthermore, because all the columns indicate the same features of a gene evaluated at different time stamps, we can analyze the worst scenario (ie, the greatest differential in the expression of 2 genes at a certain moment). This is the most significant difference between these 2 genes. To put it another way, maximum distance calculates only the difference that matters. The Euclidean and Manhattan distances, by contrast, are becoming buried in the massive amount of data. However, the maximum distance may create undesirable clusters in a different data set with uniform variation across all columns.</p>
        <p>When the data set is small, the average linkage approach performs well, and when the data set is huge, the Ward method performs well. The single linkage approach may be faster than the average method for joining clusters, but it is not necessarily better. When determining the proximity of 2 clusters, it always considers only 2 points and ignores all others. The average linkage approach, on the other hand, considers all the points in the cluster when determining relatedness. When using the Ward technique, the sum of squared errors is used to determine similarity. When working with small or medium-sized data sets, the average linkage approach outperforms the Ward linkage method, but as the data sets grow larger, the sum of squared errors values take over and produce superior results compared with the average linkage method.</p>
        <p>We tried to identify the optimal combination in our research and found that the maximum distance method performs better on hierarchical clustering when column variance is not uniform across the data set. However, if the data set is medium in size, with around 2000-7000 rows and 60-80 columns, the average linkage technique will outperform other linkage methods, and if the data set is very large, with 12,000-15,000 rows and 100-200 columns, the Ward linkage approach will outperform other linkage methods. Furthermore, it has been discovered that ensemble clustering can improve performance by a very small amount at the cost of extra work.</p>
        <p>We discovered 9 common genes that cause the following 3 diseases: lymphoma, central nervous system cancer, and lung cancer. We tried to figure out how these genes play a role in these 3 diseases using the data provided in the data sets. The maximum-average hierarchical clustering technique was chosen since it performed the best in the first experiment. We used gene enrichment score to confirm our findings on whether the 9 genes discovered have an impact on these 3 conditions. <xref rid="figure3" ref-type="fig">Figure 3</xref> shows the gene enrichment scores for these genes. We can see that 8 of the 9 genes are important for all 3 cancers. Only 1 gene (AFFX-PheX-3 at) is more important than the other 2 in lung cancer. However, it is clear that our discovered genes have a significant impact on these 3 cancer forms.</p>
        <p>Bioinformatics is becoming more and more involved in health sectors, such as disease detection and individualized medicine recommendation, as computational technology advances. Clustering techniques are becoming increasingly important in these industries. We investigated several distance-linkage combinations and attempted to find a solution. We hope that other researchers who use hierarchical clustering will profit from our findings and apply what they have learned to their own study. We also discovered common genes with multiple symptoms, which we confirmed using gene enrichment profiling. Knowing the pleiotropic nature of these genes will help scientists work on them to combat cancer.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>In this study, we discovered a set of measures that will yield higher-quality clusters for gene expression data. Pleiotropic behavior of common genes for many disorders was also discovered. To validate our findings, we used a variety of data sets that varied in size and richness. We used a fitness function to compare cluster quality between sets of clusters while assessing cluster quality. For medium-sized data sets, we discovered that the maximum distance metric combined with average linkage works best. Ward linkage also works better with huge data sets. Furthermore, due to data dimension differences, we had to preprocess data while identifying common genes for various disorders. It is critical to identify genes with similar symptoms more precisely and to separate those genes more effectively. Furthermore, detecting a gene by applying the clustering technique to find comparable genes is a critical task for researchers and, if done correctly, might save countless lives. For all these reasons, correct clustering is becoming increasingly important in bioinformatics. Therefore, if their data set resembles our microarray data, researchers from other fields can employ this technology.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ALL-AML</term>
          <def>
            <p>acute lymphoblastic leukemia-acute myeloid leukemia</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">ASW</term>
          <def>
            <p>average silhouette width</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Richter</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Ott</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Auch</surname>
              <given-names>AF</given-names>
            </name>
            <name name-style="western">
              <surname>Schmid</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Huson</surname>
              <given-names>DH</given-names>
            </name>
          </person-group>
          <article-title>MetaSim: a sequencing simulator for genomics and metagenomics</article-title>
          <source>Handbook of Molecular Microbial Ecology I: Metagenomics and Complementary Approaches</source>
          <year>2011</year>
          <month>05</month>
          <day>03</day>
          <publisher-loc>Hoboken, NJ</publisher-loc>
          <publisher-name>Wiley</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>White</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>DW</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Bioinformatics strategies for proteomic profiling</article-title>
          <source>Clin Biochem</source>
          <year>2004</year>
          <month>07</month>
          <volume>37</volume>
          <issue>7</issue>
          <fpage>636</fpage>
          <lpage>41</lpage>
          <pub-id pub-id-type="doi">10.1016/j.clinbiochem.2004.05.004</pub-id>
          <pub-id pub-id-type="medline">15234244</pub-id>
          <pub-id pub-id-type="pii">S0009912004001262</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smyth</surname>
              <given-names>GK</given-names>
            </name>
          </person-group>
          <article-title>Limma: linear models for microarray data</article-title>
          <source>Bioinformatics and Computational Biology Solutions Using R and Bioconductor</source>
          <year>2005</year>
          <publisher-loc>New York, US</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>397</fpage>
          <lpage>420</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Letunic</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Bork</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Interactive Tree Of Life v2: online annotation and display of phylogenetic trees made easy</article-title>
          <source>Nucleic Acids Res</source>
          <year>2011</year>
          <month>07</month>
          <day>05</day>
          <volume>39</volume>
          <issue>Web Server issue</issue>
          <fpage>W475</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21470960"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/nar/gkr201</pub-id>
          <pub-id pub-id-type="medline">21470960</pub-id>
          <pub-id pub-id-type="pii">gkr201</pub-id>
          <pub-id pub-id-type="pmcid">PMC3125724</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Godini</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Fallahi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A brief overview of the concepts, methods and computational tools used in phylogenetic tree construction and gene prediction</article-title>
          <source>Meta Gene</source>
          <year>2019</year>
          <month>09</month>
          <volume>21</volume>
          <fpage>100586</fpage>
          <pub-id pub-id-type="doi">10.1016/j.mgene.2019.100586</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>GW</given-names>
            </name>
            <name name-style="western">
              <surname>Prinz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Neou</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shelby</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Marzolf</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Thorsson</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Galitski</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Prediction of phenotype and gene expression for combinations of mutations</article-title>
          <source>Mol Syst Biol</source>
          <year>2007</year>
          <month>03</month>
          <day>27</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>96</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/msb4100137"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/msb4100137</pub-id>
          <pub-id pub-id-type="medline">17389876</pub-id>
          <pub-id pub-id-type="pii">msb4100137</pub-id>
          <pub-id pub-id-type="pmcid">PMC1847951</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alberts</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Raff</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Walter</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Studying gene expression and function</article-title>
          <source>Molecular Biology of the Cell, 4th edition</source>
          <year>2002</year>
          <publisher-loc>New York, US</publisher-loc>
          <publisher-name>Garland Science</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Letunic</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Bork</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Interactive Tree Of Life (iTOL): an online tool for phylogenetic tree display and annotation</article-title>
          <source>Bioinformatics</source>
          <year>2007</year>
          <month>01</month>
          <day>01</day>
          <volume>23</volume>
          <issue>1</issue>
          <fpage>127</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btl529</pub-id>
          <pub-id pub-id-type="medline">17050570</pub-id>
          <pub-id pub-id-type="pii">btl529</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ashraf</surname>
              <given-names>FB</given-names>
            </name>
            <name name-style="western">
              <surname>Ajwad</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mottalib</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>A novel gene-tree based approach to infer relations among disease-genes across different cancer types</article-title>
          <year>2019</year>
          <conf-name>International Conference on Electrical, Computer and Communication Engineering (ECCE)</conf-name>
          <conf-date>February 07-09, 2019</conf-date>
          <conf-loc>Cox's Bazar, Bangladesh</conf-loc>
          <pub-id pub-id-type="doi">10.1109/ecace.2019.8678921</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Medico</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>FLAME, a novel fuzzy clustering method for the analysis of DNA microarray data</article-title>
          <source>BMC Bioinformatics</source>
          <year>2007</year>
          <month>01</month>
          <day>04</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2105-8-3</pub-id>
          <pub-id pub-id-type="medline">17204155</pub-id>
          <pub-id pub-id-type="pii">1471-2105-8-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC1774579</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Murty</surname>
              <given-names>MN</given-names>
            </name>
            <name name-style="western">
              <surname>Flynn</surname>
              <given-names>PJ</given-names>
            </name>
          </person-group>
          <article-title>Data clustering</article-title>
          <source>ACM Comput. Surv</source>
          <year>1999</year>
          <month>09</month>
          <volume>31</volume>
          <issue>3</issue>
          <fpage>264</fpage>
          <lpage>323</lpage>
          <pub-id pub-id-type="doi">10.1145/331499.331504</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lord</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Diallo</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Makarenkov</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Classification of bioinformatics workflows using weighted versions of partitioning and hierarchical clustering algorithms</article-title>
          <source>BMC Bioinformatics</source>
          <year>2015</year>
          <month>03</month>
          <day>03</day>
          <volume>16</volume>
          <issue>1</issue>
          <fpage>68</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0508-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12859-015-0508-1</pub-id>
          <pub-id pub-id-type="medline">25887434</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12859-015-0508-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC4354763</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Greiff</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Bhat</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Menzel</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>ST</given-names>
            </name>
          </person-group>
          <article-title>A bioinformatic framework for immune repertoire diversity profiling enables detection of immunological status</article-title>
          <source>Genome Med</source>
          <year>2015</year>
          <month>05</month>
          <day>28</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>49</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-015-0169-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13073-015-0169-8</pub-id>
          <pub-id pub-id-type="medline">26140055</pub-id>
          <pub-id pub-id-type="pii">169</pub-id>
          <pub-id pub-id-type="pmcid">PMC4489130</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Rui</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Xiao-Feng</given-names>
            </name>
            <name name-style="western">
              <surname>Shu</surname>
              <given-names>Yong-Qian</given-names>
            </name>
          </person-group>
          <article-title>Prediction of non-small cell lung cancer metastasis-associated microRNAs using bioinformatics</article-title>
          <source>Am J Cancer Res</source>
          <year>2015</year>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>32</fpage>
          <lpage>51</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25628919"/>
          </comment>
          <pub-id pub-id-type="medline">25628919</pub-id>
          <pub-id pub-id-type="pmcid">PMC4300719</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>NT</given-names>
            </name>
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Briggs</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Timberlake</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Vigneault</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kleinstein</surname>
              <given-names>SH</given-names>
            </name>
          </person-group>
          <article-title>Hierarchical Clustering Can Identify B Cell Clones with High Confidence in Ig Repertoire Sequencing Data</article-title>
          <source>J Immunol</source>
          <year>2017</year>
          <month>03</month>
          <day>15</day>
          <volume>198</volume>
          <issue>6</issue>
          <fpage>2489</fpage>
          <lpage>2499</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jimmunol.org/cgi/pmidlookup?view=long&amp;pmid=28179494"/>
          </comment>
          <pub-id pub-id-type="doi">10.4049/jimmunol.1601850</pub-id>
          <pub-id pub-id-type="medline">28179494</pub-id>
          <pub-id pub-id-type="pii">jimmunol.1601850</pub-id>
          <pub-id pub-id-type="pmcid">PMC5340603</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Identification of cell types from single-cell transcriptomes using a novel clustering method</article-title>
          <source>Bioinformatics</source>
          <year>2015</year>
          <month>06</month>
          <day>15</day>
          <volume>31</volume>
          <issue>12</issue>
          <fpage>1974</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25805722"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/bioinformatics/btv088</pub-id>
          <pub-id pub-id-type="medline">25805722</pub-id>
          <pub-id pub-id-type="pii">btv088</pub-id>
          <pub-id pub-id-type="pmcid">PMC6280782</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Danielsson</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Euclidean distance mapping</article-title>
          <source>Computer Graphics and Image Processing</source>
          <year>1980</year>
          <month>11</month>
          <volume>14</volume>
          <issue>3</issue>
          <fpage>227</fpage>
          <lpage>248</lpage>
          <pub-id pub-id-type="doi">10.1016/0146-664x(80)90054-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Klove</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tzeng</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Permutation Arrays Under the Chebyshev Distance</article-title>
          <source>IEEE Trans. Inform. Theory</source>
          <year>2010</year>
          <month>06</month>
          <volume>56</volume>
          <issue>6</issue>
          <fpage>2611</fpage>
          <lpage>2617</lpage>
          <pub-id pub-id-type="doi">10.1109/tit.2010.2046212</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Everitt</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Landau</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Leese</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stahl</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <source>Cluster Analysis, 5th Edition</source>
          <year>2011</year>
          <publisher-loc>New York, US</publisher-loc>
          <publisher-name>John Wiley &amp; Sons</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pop</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Salzberg</surname>
              <given-names>SL</given-names>
            </name>
          </person-group>
          <article-title>Bioinformatics challenges of new sequencing technology</article-title>
          <source>Trends Genet</source>
          <year>2008</year>
          <month>03</month>
          <volume>24</volume>
          <issue>3</issue>
          <fpage>142</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18262676"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.tig.2007.12.006</pub-id>
          <pub-id pub-id-type="medline">18262676</pub-id>
          <pub-id pub-id-type="pii">S0168-9525(08)00022-X</pub-id>
          <pub-id pub-id-type="pmcid">PMC2680276</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Paaby</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Rockman</surname>
              <given-names>MV</given-names>
            </name>
          </person-group>
          <article-title>The many faces of pleiotropy</article-title>
          <source>Trends Genet</source>
          <year>2013</year>
          <month>02</month>
          <volume>29</volume>
          <issue>2</issue>
          <fpage>66</fpage>
          <lpage>73</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/23140989"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.tig.2012.10.010</pub-id>
          <pub-id pub-id-type="medline">23140989</pub-id>
          <pub-id pub-id-type="pii">S0168-9525(12)00169-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC3558540</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Carta</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Galatioto</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ramirez</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Cardiovascular manifestations in Marfan syndrome and related diseases; multiple genes causing similar phenotypes</article-title>
          <source>Clin Genet</source>
          <year>2015</year>
          <volume>87</volume>
          <issue>1</issue>
          <fpage>11</fpage>
          <lpage>20</lpage>
          <pub-id pub-id-type="doi">10.1111/cge.12436</pub-id>
          <pub-id pub-id-type="medline">24867163</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McClellan</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Susser</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Schizophrenia: a common disease caused by multiple rare alleles</article-title>
          <source>Br J Psychiatry</source>
          <year>2007</year>
          <month>03</month>
          <day>02</day>
          <volume>190</volume>
          <issue>3</issue>
          <fpage>194</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1192/bjp.bp.106.025585</pub-id>
          <pub-id pub-id-type="medline">17329737</pub-id>
          <pub-id pub-id-type="pii">S0007125000171363</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Altshuler</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Daly</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lander</surname>
              <given-names>ES</given-names>
            </name>
          </person-group>
          <article-title>Genetic mapping in human disease</article-title>
          <source>Science</source>
          <year>2008</year>
          <month>11</month>
          <day>07</day>
          <volume>322</volume>
          <issue>5903</issue>
          <fpage>881</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/18988837"/>
          </comment>
          <pub-id pub-id-type="doi">10.1126/science.1156409</pub-id>
          <pub-id pub-id-type="medline">18988837</pub-id>
          <pub-id pub-id-type="pii">322/5903/881</pub-id>
          <pub-id pub-id-type="pmcid">PMC2694957</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davidsohn</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Pezone</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vernet</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Graveline</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Oliver</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Slomovic</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Punthambaker</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bonventre</surname>
              <given-names>JV</given-names>
            </name>
            <name name-style="western">
              <surname>Church</surname>
              <given-names>GM</given-names>
            </name>
          </person-group>
          <article-title>A single combination gene therapy treats multiple age-related diseases</article-title>
          <source>Proc Natl Acad Sci U S A</source>
          <year>2019</year>
          <month>11</month>
          <day>19</day>
          <volume>116</volume>
          <issue>47</issue>
          <fpage>23505</fpage>
          <lpage>23511</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.pnas.org/doi/abs/10.1073/pnas.1910073116?url_ver=Z39.88-2003&amp;rfr_id=ori:rid:crossref.org&amp;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.1910073116</pub-id>
          <pub-id pub-id-type="medline">31685628</pub-id>
          <pub-id pub-id-type="pii">1910073116</pub-id>
          <pub-id pub-id-type="pmcid">PMC6876218</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vega-Pons</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ruiz-Shulcloper</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A survey of clustering ensemble algorithms</article-title>
          <source>Int. J. Patt. Recogn. Artif. Intell</source>
          <year>2011</year>
          <month>11</month>
          <day>21</day>
          <volume>25</volume>
          <issue>03</issue>
          <fpage>337</fpage>
          <lpage>372</lpage>
          <pub-id pub-id-type="doi">10.1142/s0218001411008683</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Strehl</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ghosh</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Cluster Ensembles – A Knowledge Reuse Framework for Combining Multiple Partitions</article-title>
          <source>Journal of Machine Learning Research</source>
          <year>2002</year>
          <month>02</month>
          <day>12</day>
          <volume>3</volume>
          <fpage>583</fpage>
          <lpage>617</lpage>
          <pub-id pub-id-type="doi">10.1162/153244303321897735</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fern</surname>
              <given-names>XZ</given-names>
            </name>
            <name name-style="western">
              <surname>Brodley</surname>
              <given-names>CE</given-names>
            </name>
          </person-group>
          <article-title>Solving cluster ensemble problems by bipartite graph partitioning</article-title>
          <year>2004</year>
          <month>07</month>
          <day>04</day>
          <conf-name>Proceedings of the twenty-first international conference on Machine learning</conf-name>
          <conf-date>July 4-8, 2004</conf-date>
          <conf-loc>Banff, AB, Canada</conf-loc>
          <pub-id pub-id-type="doi">10.1145/1015330.1015414</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Caruana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Elhawary</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Meta clustering</article-title>
          <year>2006</year>
          <conf-name>Sixth International Conference on Data Mining (ICDM'06)</conf-name>
          <conf-date>December 18-22, 2006</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icdm.2006.103</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rees</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Summarizing data by numerical measures</article-title>
          <source>Essential statistics</source>
          <year>1989</year>
          <publisher-loc>Boston, USA</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>24</fpage>
          <lpage>38</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Dash</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Markov blanket-embedded genetic algorithm for gene selection</article-title>
          <source>Pattern Recognition</source>
          <year>2007</year>
          <month>11</month>
          <volume>40</volume>
          <issue>11</issue>
          <fpage>3236</fpage>
          <lpage>3248</lpage>
          <pub-id pub-id-type="doi">10.1016/j.patcog.2007.02.007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <article-title>Gene Enrichment Profiler</article-title>
          <source>Center for Computational and Integrative Biology</source>
          <access-date>2022-05-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://xavierlab2.mgh.harvard.edu/EnrichmentProfiler/help.html">http://xavierlab2.mgh.harvard.edu/EnrichmentProfiler/help.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
