<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Bioinform Biotech</journal-id><journal-id journal-id-type="publisher-id">bioinform</journal-id><journal-id journal-id-type="index">19</journal-id><journal-title>JMIR Bioinformatics and Biotechnology</journal-title><abbrev-journal-title>JMIR Bioinform Biotech</abbrev-journal-title><issn pub-type="epub">2563-3570</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v6i1e70706</article-id><article-id pub-id-type="doi">10.2196/70706</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Extracting Knowledge From Scientific Texts on Patient-Derived Cancer Models Using Large Language Models: Algorithm Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Yao</surname><given-names>Jiarui</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Perova</surname><given-names>Zinaida</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Mandloi</surname><given-names>Tushar</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lewis</surname><given-names>Elizabeth</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Parkinson</surname><given-names>Helen</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Savova</surname><given-names>Guergana</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Computational Health Informatics Program, Boston Children's Hospital</institution><addr-line>401 Park Drive</addr-line><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Harvard Medical School</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff3"><institution>European Molecular Biology Laboratory, European Bioinformatics Institute</institution><addr-line>Hinxton</addr-line><country>United Kingdom</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Finkelstein</surname><given-names>Joseph</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Dadheech</surname><given-names>Pankaj</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Eger</surname><given-names>Steffen</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Chen</surname><given-names>Ziru</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jiarui Yao, PhD, Computational Health Informatics Program, Boston Children's Hospital, 401 Park Drive, Boston, MA, United States, 1 7813545014; <email>jiarui.yao@childrens.harvard.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>30</day><month>6</month><year>2025</year></pub-date><volume>6</volume><elocation-id>e70706</elocation-id><history><date date-type="received"><day>30</day><month>12</month><year>2024</year></date><date date-type="rev-recd"><day>14</day><month>04</month><year>2025</year></date><date date-type="accepted"><day>27</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; Jiarui Yao, Zinaida Perova, Tushar Mandloi, Elizabeth Lewis, Helen Parkinson, Guergana Savova. Originally published in JMIR Bioinformatics and Biotechnology (<ext-link ext-link-type="uri" xlink:href="https://bioinform.jmir.org">https://bioinform.jmir.org</ext-link>), 30.6.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/">http://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Bioinformatics and Biotechnology, is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://bioinform.jmir.org/">https://bioinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://bioinform.jmir.org/2025/1/e70706"/><abstract><sec><title>Background</title><p>Patient-derived cancer models (PDCMs) have become essential tools in cancer research and preclinical studies. Consequently, the number of publications on PDCMs has increased significantly over the past decade. Advances in artificial intelligence, particularly in large language models (LLMs), offer promising solutions for extracting knowledge from scientific literature at scale.</p></sec><sec><title>Objective</title><p>This study aims to investigate LLM-based systems, focusing specifically on prompting techniques for the automated extraction of PDCM-related entities from scientific texts.</p></sec><sec sec-type="methods"><title>Methods</title><p>We explore 2 LLM-prompting approaches. The classic method, direct prompting, involves manually designing a prompt. Our direct prompt consists of an instruction, entity-type definitions, gold examples, and a query. In addition, we experiment with a novel and underexplored prompting strategy&#x2014;soft prompting. Unlike direct prompting, soft prompts are trainable continuous vectors that learn from provided data. We evaluate both prompting approaches across state-of-the-art proprietary and open LLMs.</p></sec><sec sec-type="results"><title>Results</title><p>We manually annotated 100 abstracts of PDCM-relevant papers, focusing on PDCM papers with data deposited in the CancerModels.Org platform. The resulting gold annotations span 15 entity types for a total 3313 entity mentions, which we split across training (2089 entities), development (542 entities) and held-out, eye-off test (682 entities) sets. 
Evaluation includes the standard metrics of precision or positive predictive value, recall or sensitivity, and <italic>F</italic><sub>1</sub>-score (harmonic mean of precision and recall) in 2 settings: an exact match setting, where spans of gold and predicted annotations have to match exactly, and an overlapping match setting, where the spans of gold and predicted annotations have to overlap. GPT4-o with direct prompting achieved <italic>F</italic><sub>1</sub>-scores of 50.48 and 71.36 for exact and overlapping match settings, respectively. In both evaluation settings, LLaMA3 soft prompting improved performance over direct prompting (<italic>F</italic><sub>1</sub>-score from 7.06 to 46.68 in the exact match setting; and 12.0 to 71.80 in the overlapping evaluation setting). Results with LLaMA3 soft prompting are slightly higher than GPT4-o direct prompting in the overlapping match evaluation setting.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>We investigated LLM-prompting techniques for the automatic extraction of PDCM-relevant entities from scientific texts, comparing the traditional direct prompting approach with the emerging soft prompting method. In our experiments, GPT4-o demonstrated strong performance with direct prompting, maintaining competitive results. Meanwhile, soft prompting significantly enhanced the performance of smaller open LLMs. 
Our findings suggest that training soft prompts on smaller open models can achieve performance levels comparable to those of proprietary very large language models.</p></sec></abstract><kwd-group><kwd>patient-derived cancer models</kwd><kwd>large language models</kwd><kwd>knowledge extraction</kwd><kwd>in-context learning</kwd><kwd>soft prompting</kwd><kwd>prompt tuning</kwd><kwd>information extraction</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Patient-derived cancer models (PDCMs) are created from a patient&#x2019;s own tumor sample and capture the complexity of human tumors to enable more accurate, personalized drug development and treatment selection. These models, including patient-derived xenografts (PDXs), organoids, and cell lines, allow researchers to test treatments and identify the most effective therapies, and have emerged as indispensable tools in both cancer research and precision medicine. The US National Institutes of Health (NIH) have made significant investments in the generation and characterization of these models, with more than US $3 billion dedicated to active grants referencing PDCMs with a component of their research based on data extracted from the NIH RePORTER [<xref ref-type="bibr" rid="ref1">1</xref>] for fiscal year 2024 alone. The number of publications using PDCMs continues to increase generating substantial and rich metadata and data that require standardization, harmonization, and integration to maximize the impact of these models and their associated data within the research and clinical communities. CancerModels.Org platform [<xref ref-type="bibr" rid="ref2">2</xref>] serves as a unified gateway to the largest collection of PDCMs and related data. It empowers researchers and clinicians to discover suitable models for testing research hypotheses, conducting large-scale drug screenings, and advancing precision medicine initiatives. 
Extraction of PDCM-relevant knowledge and its harmonization within CancerModels.Org is essential to ensure that basic and translational researchers, bioinformaticians, and tool developers have access to PDCM knowledge. While manual curation of publications ensures high accuracy when performed by domain experts, it is time-consuming and labor-intensive. Thus, a more streamlined and efficient knowledge acquisition method is needed to address the growing demand within the scientific community for the timely availability of the PDCM metadata and its associated data.</p><p>In parallel, large language models (LLMs) [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>] often referred to as generative artificial intelligence (AI) systems are trained on vast amounts of data and have demonstrated impressive capabilities in the health care domain [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Researchers have studied the use of LLMs in addressing various tasks related to health care such as diagnosing conditions [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>], clinical decision support [<xref ref-type="bibr" rid="ref11">11</xref>], answering patient questions [<xref ref-type="bibr" rid="ref12">12</xref>], and medical education [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. It has been shown that LLMs can extract meaningful information from texts [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>In this work, we explore LLM-prompting techniques with the goal of extracting knowledge from PDCM-relevant scholarly publications. We focus on the classic direct prompting [<xref ref-type="bibr" rid="ref4">4</xref>] and the underexplored soft prompting [<xref ref-type="bibr" rid="ref18">18</xref>] with state-of-the-art (SOTA) proprietary and open LLMs. 
Our experimental results provide insights into selecting the optimal prompting methods for specific tasks. The contributions of this paper are:</p><list list-type="order"><list-item><p>Studying the feasibility of SOTA LLMs as oncology knowledge extractors for PDCM-relevant information from scholarly scientific literature.</p></list-item><list-item><p>Creating a manually curated gold dataset spanning 15 entity types for a total 3313 entity mentions from 100 abstracts of PDCM-relevant papers.</p></list-item><list-item><p>Researching and comparing, to our knowledge for the first time, direct versus soft prompting techniques for oncology knowledge extraction, specifically PDCM-relevant information from scholarly scientific literature.</p></list-item></list></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Concepts</title><p>We define &#x201C;knowledge&#x201D; as entities of interest to researchers working with PDCMs in the cancer research field. For example, the patient&#x2019;s diagnosis provides a reference point to confirm that a PDCM faithfully recapitulates the biology of the original tumor and is essential for ensuring the model&#x2019;s relevance and reliability in studies of cancer progression or treatment response. Thus, &#x201C;diagnosis&#x201D; is important to understand the model&#x2019;s characteristics in the context of patient&#x2019;s disease. The patient&#x2019;s age can significantly affect the molecular and genetic characteristics of the tumor. For example, pediatric cancers often have distinct genetic drivers and tumor microenvironments compared to cancers in older adults. In addition, age-related biological factors, such as immune system, metabolism, and hormone levels, influence how a tumor responds to treatments. Thus, knowing the patient&#x2019;s age is imperative for predictive accuracy of the model in preclinical testing and relevance of research findings. 
Therefore, we selected 15 most commonly used CancerModels.Org data model attributes (<xref ref-type="table" rid="table1">Table 1</xref>), which include the attributes defined in the minimal information standard for patient-derived xenograft models [<xref ref-type="bibr" rid="ref19">19</xref>] and the draft minimal information standard for in vitro models [<xref ref-type="bibr" rid="ref20">20</xref>].</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Entity definitions based on the CancerModels.Org data model with examples and interannotator agreement <italic>F</italic><sub>1</sub>-scores in the exact match setting that requires the spans of the annotators to match exactly.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Entity type</td><td align="left" valign="bottom">Definition</td><td align="left" valign="bottom">Example</td><td align="left" valign="bottom">IAA<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">diagnosis</td><td align="left" valign="top">Diagnosis at the time of collection of the patient tumor used in the cancer model</td><td align="left" valign="top">TNBC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">61.67</td></tr><tr><td align="left" valign="top">age_category</td><td align="left" valign="top">Age category of the patient at the time of tissue sampling</td><td align="left" valign="top">Adult, pediatric</td><td align="left" valign="top">60</td></tr><tr><td align="left" valign="top">genetic_effect</td><td align="left" valign="top">Any form of chromosomal rearrangement or gene-level changes</td><td align="left" valign="top">Missense, amplification</td><td align="left" valign="top">57.67</td></tr><tr><td align="left" valign="top">model_type</td><td align="left" valign="top">Type of patient-derived model</td><td align="left" valign="top">PDX<sup><xref 
ref-type="table-fn" rid="table1fn3">c</xref></sup>, organoid</td><td align="left" valign="top">53.33</td></tr><tr><td align="left" valign="top">molecular_char</td><td align="left" valign="top">Data or assay generated from or performed on the model in this study</td><td align="left" valign="top">RNA sequencing, whole-exome sequencing</td><td align="left" valign="top">54.33</td></tr><tr><td align="left" valign="top">biomarker</td><td align="left" valign="top">Gene, protein or biological molecule identified in or associated with patient&#x2019;s/model&#x2019;s tumor</td><td align="left" valign="top">BRCA1<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup>, IDH<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup>, epidermal growth factor receptor 2</td><td align="left" valign="top">61.33</td></tr><tr><td align="left" valign="top">treatment</td><td align="left" valign="top">Treatment received by the patient or tested on the model</td><td align="left" valign="top">Surgery, chemotherapy, PARP-inhibitor</td><td align="left" valign="top">55.67</td></tr><tr><td align="left" valign="top">response_to_treatment</td><td align="left" valign="top">Effect of the treatment on the patient&#x2019;s tumor or model</td><td align="left" valign="top">Progression-free survival, reduced tumor growth</td><td align="left" valign="top">55</td></tr><tr><td align="left" valign="top">sample_type</td><td align="left" valign="top">The type of material used to generate the model or how this material was obtained</td><td align="left" valign="top">Tissue fragment, autopsy</td><td align="left" valign="top">49</td></tr><tr><td align="left" valign="top">tumor_type</td><td align="left" valign="top">Collected tumor type used for generating the model</td><td align="left" valign="top">Primary, recurrent</td><td align="left" valign="top">49.67</td></tr><tr><td align="left" valign="top">cancer_grade</td><td align="left" valign="top">Quantitative or qualitative grade reflecting how quickly the 
cancer is likely to grow</td><td align="left" valign="top">Grade 1, low-grade</td><td align="left" valign="top">42</td></tr><tr><td align="left" valign="top">cancer_stage</td><td align="left" valign="top">Information about the cancer&#x2019;s extent in the body according to specific type of cancer staging system</td><td align="left" valign="top">TNM<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup> system, T0, stage I</td><td align="left" valign="top">59.33</td></tr><tr><td align="left" valign="top">clinical_trial</td><td align="left" valign="top">The type of clinical trial or Clinicaltrials.org identifier</td><td align="left" valign="top">Phase II, prospective randomized clinical trials</td><td align="left" valign="top">60.67</td></tr><tr><td align="left" valign="top">host_strain</td><td align="left" valign="top">The name of the mouse host strain where the tissue sample was engrafted for generating the PDX model</td><td align="left" valign="top">NOD-SCID<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td><td align="left" valign="top">61.67</td></tr><tr><td align="left" valign="top">model_id</td><td align="left" valign="top">ID of the patient-derived cancer model generated in this study</td><td align="left" valign="top">PHLC402</td><td align="left" valign="top">100</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>IAA: interannotator agreement.</p></fn><fn id="table1fn2"><p><sup>b</sup>TNBC: triple-negative breast cancer.</p></fn><fn id="table1fn3"><p><sup>c</sup>PDX: patient-derived xenograft.</p></fn><fn id="table1fn4"><p><sup>d</sup>BRCA1: breast cancer gene 1.</p></fn><fn id="table1fn5"><p><sup>e</sup>IDH: isocitrate dehydrogenase.</p></fn><fn id="table1fn6"><p><sup>f</sup>TNM: tumor node metastasis.</p></fn><fn id="table1fn7"><p><sup>g</sup>NOD-SCID: nonobese diabetic severe combined immunodeficiency.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-2"><title>Corpus</title><p>We used 100 abstracts to 
develop the gold-standard corpus annotated for the 15 entities (<xref ref-type="table" rid="table1">Table 1</xref>). The abstracts were chosen from publications linked to the PDCMs submitted to the CancerModels.Org platform. They were selected to cover all 3 types of models in the resource&#x2014;PDXs, organoids, and cell lines. The final corpus is available on GitHub (see Data and Code Availability section).</p><p>Three annotators (ZP, TM, and EL) independently labeled entities in all 100 abstracts for a total of 40 hours. The annotation quality was tracked through interannotator agreement (IAA), a measure of agreement between each annotation produced by different annotators working on the same dataset. The IAA is an indication of how difficult the task is for humans, and it becomes the target for system development. We used pairwise <italic>F</italic><sub>1</sub>-score as the IAA metric [<xref ref-type="bibr" rid="ref21">21</xref>] in the exact match setting that requires the spans of the annotators to match exactly. We computed the agreement between each pair of annotators and averaged across the 3 sets of scores. The final IAA for each entity type is reported in <xref ref-type="table" rid="table1">Table 1</xref>. The IAA range is 42&#x2010;100, indicating moderate agreement. Note that the lowest agreement is for low-occurrence entity types, for example, cancer_grade has only 8 instances with 42 IAA. These low-frequency entity types are more likely to be overlooked by the human experts as annotation is a cognitively demanding task. Thus, to ensure a high-quality gold-standard dataset, we overlaid the single annotations with an adjudication step, where the annotators discussed annotation disagreements and potential missed annotations to come to final joint decisions. 
The resulting gold dataset, spanning 15 entity types for a total of 3313 entity mentions (refer to <xref ref-type="table" rid="table2">Table 2</xref> for distributions), was split into training, development, and test sets in the standard 60:20:20 ratio. The training set was used for creating entity extraction algorithms, the development set for refining the algorithms, and the test set for the final evaluation.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Distribution of entity type annotations across training, development, and test sets.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Entity type</td><td align="left" valign="bottom">Training, n</td><td align="left" valign="bottom">Development, n</td><td align="left" valign="bottom">Test, n</td><td align="left" valign="bottom">Total, n</td></tr></thead><tbody><tr><td align="left" valign="top">diagnosis</td><td align="left" valign="top">362</td><td align="left" valign="top">122</td><td align="left" valign="top">114</td><td align="left" valign="top">598</td></tr><tr><td align="left" valign="top">age_category</td><td align="left" valign="top">19</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">19</td></tr><tr><td align="left" valign="top">genetic_effect</td><td align="left" valign="top">69</td><td align="left" valign="top">20</td><td align="left" valign="top">33</td><td align="left" valign="top">122</td></tr><tr><td align="left" valign="top">model_type</td><td align="left" valign="top">326</td><td align="left" valign="top">114</td><td align="left" valign="top">110</td><td align="left" valign="top">550</td></tr><tr><td align="left" valign="top">molecular_char</td><td align="left" valign="top">128</td><td align="left" valign="top">37</td><td align="left" valign="top">46</td><td align="left" valign="top">211</td></tr><tr><td align="left" valign="top">biomarker</td><td align="left" 
valign="top">503</td><td align="left" valign="top">118</td><td align="left" valign="top">163</td><td align="left" valign="top">784</td></tr><tr><td align="left" valign="top">treatment</td><td align="left" valign="top">426</td><td align="left" valign="top">77</td><td align="left" valign="top">130</td><td align="left" valign="top">633</td></tr><tr><td align="left" valign="top">response_to_treatment</td><td align="left" valign="top">99</td><td align="left" valign="top">21</td><td align="left" valign="top">28</td><td align="left" valign="top">148</td></tr><tr><td align="left" valign="top">sample_type</td><td align="left" valign="top">22</td><td align="left" valign="top">8</td><td align="left" valign="top">7</td><td align="left" valign="top">37</td></tr><tr><td align="left" valign="top">tumor_type</td><td align="left" valign="top">61</td><td align="left" valign="top">19</td><td align="left" valign="top">28</td><td align="left" valign="top">108</td></tr><tr><td align="left" valign="top">cancer_grade</td><td align="left" valign="top">6</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">cancer_stage</td><td align="left" valign="top">7</td><td align="left" valign="top">1</td><td align="left" valign="top">4</td><td align="left" valign="top">12</td></tr><tr><td align="left" valign="top">clinical_trial</td><td align="left" valign="top">35</td><td align="left" valign="top">2</td><td align="left" valign="top">4</td><td align="left" valign="top">41</td></tr><tr><td align="left" valign="top">host_strain</td><td align="left" valign="top">9</td><td align="left" valign="top">0</td><td align="left" valign="top">7</td><td align="left" valign="top">16</td></tr><tr><td align="left" valign="top">model_id</td><td align="left" valign="top">17</td><td align="left" valign="top">2</td><td align="left" valign="top">7</td><td align="left" valign="top">26</td></tr><tr><td align="left" 
valign="top">Total</td><td align="left" valign="top">2089</td><td align="left" valign="top">542</td><td align="left" valign="top">682</td><td align="left" valign="top">3313</td></tr></tbody></table></table-wrap></sec><sec id="s2-3"><title>Prompting Methods</title><p>Various prompting techniques have been proposed since the emergence of LLMs [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. At a high level, these prompting techniques can be divided into 2 categories, direct prompting [<xref ref-type="bibr" rid="ref4">4</xref>] and soft prompting [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. The main difference between the two methods is the prompt representation, that is, whether the prompt consists of human language words or vectors (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Direct prompting (or discrete prompting) is the most intuitive and now classic prompting method where users directly interact with LLMs using natural language. For example, a user may ask ChatGPT to &#x201C;Write a thank you note to an old friend of my parents&#x201D;; in this case, the text within the quotation marks is a discrete prompt. Soft prompting (or continuous prompting) uses a machine learning approach to train a sequence of continuous vectors, which are the &#x201C;virtual tokens&#x201D; of the prompt. It is worth noting that soft prompting differs from fine-tuning. With soft prompting, the LLM parameters are not updated; only the soft prompt parameters are adjusted. In contrast, fine-tuning requires updating the parameters of the entire LLM, and therefore needs more computation resources. Both prompting techniques have their advantages and disadvantages. Compared to direct prompting, soft prompting does not require the tedious process of manually creating prompts; however, it requires some labeled data to train the prompt. 
In this work, we explore both direct and soft prompting as we aim to explore the latest developments in LLMs and prompting techniques for the task of extracting PDCM entities from abstracts of academic papers.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Illustration of the 2 prompting methods. In direct prompting, a prompt contains a sequence of words. In soft prompting, a prompt consists of a list of vectors. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e70706_fig01.png"/></fig></sec><sec id="s2-4"><title>Direct Prompting</title><p>When asking LLMs to extract entities such as diagnoses or biomarkers, the most intuitive way is to ask LLMs to output the entities directly. In example 1 below, &#x201C;ALK&#x201D; is a biomarker entity. One may expect the model to output <italic>{&#x201C;biomarker&#x201D; [ALK]}</italic>. However, we note that the string &#x201C;ALK&#x201D; is mentioned multiple times in this example text, therefore it is not clear which &#x201C;ALK&#x201D; the model refers to. To get the most precise extraction to facilitate a more fine-grained analysis, we instruct the model to output the offsets of the specific mentions in the text (ie, the spans). For instance, if the model gives us <italic>[(48, 51, &#x201C;ALK,&#x201D; biomarker), (323, 326, &#x201C;ALK,&#x201D; biomarker), &#x2026;]</italic>, we know that from character 48 to character 51, there is a biomarker entity, &#x201C;ALK.&#x201D; Similarly, we can find another biomarker entity &#x201C;ALK&#x201D; at position 323&#x2010;326.</p><disp-quote><p>Example 1:</p><p>Oncogenic fusion of anaplastic lymphoma kinase (ALK) with echinoderm microtubule associated protein like 4 protein or other partner genes occurs in 3 to 6% of lung adenocarcinomas. 
Although fluorescence in situ hybridization (FISH) is the accepted standard for detecting anaplastic lymphoma receptor tyrosine kinase gene (ALK) gene rearrangement that gives rise to new fusion genes, not all ALK FISH-positive patients respond to ALK inhibitor therapies.</p></disp-quote><p>We started our exploration by designing prompts with an explicit instruction to specify the character offsets of each entity along with the entity text and type (eg, <italic>48, 51, &#x201C;ALK&#x201D;, biomarker</italic>). However, our experiments show that it was challenging for the LLM to output the correct character offsets, a seemingly straightforward task (all the model needs to do is to count the number of characters); however, the complexity of this seemingly straightforward task is likely due to the LLM&#x2019;s way of breaking words outside its vocabulary into so-called word pieces, for example, &#x201C;organoid&#x201D; is broken down into 2 word pieces &#x201C;organ&#x201D; and &#x201C;-oid.&#x201D; Considering that LLMs were trained as generative models [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], we subsequently cast the entity extraction task as a generation task, where we instructed the model to mark the entities with XML tags. For instance, if the model outputs &#x201C;Oncogenic fusion of anaplastic lymphoma kinase (&#x003C;biomarker&#x003E;ALK&#x003C;/biomarker&#x003E;) with echinoderm microtubule &#x2026;,&#x201D; then postprocessing the output with regular expressions would find the exact position of &#x201C;ALK&#x201D; in the text. 
Specifically, we asked the LLMs to mark the start and end of an entity with &#x003C;entity_type&#x003E; and &#x003C;/entity_type&#x003E; tags, where entity_type is a placeholder for the specific entity type, such as biomarker or treatment (refer <xref ref-type="table" rid="table1">Table 1</xref> for the full list).</p></sec><sec id="s2-5"><title>Soft Prompting</title><p>Designing the direct prompts manually could be time-consuming and minor changes in the prompt language could lead to drastic changes in the model performance [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. On the other hand, soft prompting requires some amount of gold data for its training and annotating gold data by domain experts could also be time-consuming. Fortunately, only a small set of labeled data are needed to train soft prompts. As described above, we created a gold dataset, which we used for training and evaluating our soft prompting approach.</p><p>There are a few soft prompting methods; the difference usually lies in how the prompt vectors are initialized and learned. Prompt-tuning [<xref ref-type="bibr" rid="ref18">18</xref>] is a technique that learns the prompt by adding a list of virtual tokens (ie, vectors) in front of the input, where the virtual tokens can be randomly initialized, or drawn from a pretrained word embedding [<xref ref-type="bibr" rid="ref28">28</xref>] set. Another method is P-tuning [<xref ref-type="bibr" rid="ref24">24</xref>], which uses small neural networks such as feedforward neural networks [<xref ref-type="bibr" rid="ref29">29</xref>] (multilayer perceptron) or recurrent neural networks [<xref ref-type="bibr" rid="ref30">30</xref>] (eg, long short-term memory) as the prompt encoder to learn the prompt. Only the parameters in the prompt encoder are updated during training, while the weights in the LLMs remain frozen. 
In our experiments, we found P-tuning did not always converge to an optimal solution for our task perhaps due to the random initialization of the vectors rather than using carefully pretrained word embeddings. Therefore, we focused on prompt-tuning in this work. Following Lester et al [<xref ref-type="bibr" rid="ref18">18</xref>], we initialized the vectors in the prompt with the embeddings of the label words in the entity type set (<xref ref-type="table" rid="table1">Table 1</xref>).</p><p>The standard approach for entity extraction in natural language processing is via token classification [<xref ref-type="bibr" rid="ref31">31</xref>]. Concretely, a classifier is trained to predict the label for each token in a sentence according to a predefined label set. Additionally, each label is prepended with a B or I prefix to indicate the entity&#x2019;s Beginning or Inside mention, respectively. An example is provided in <xref ref-type="fig" rid="figure2">Figure 2</xref>. &#x201C;Ewing sarcoma&#x201D; is an entity mention of the diagnosis type. Thus &#x201C;Ewing&#x201D; and &#x201C;sarcoma&#x201D; are labeled as &#x201C;Diagnosis,&#x201D; while all other tokens are labeled as &#x201C;O,&#x201D; meaning they are Outside of an entity. To be more precise, &#x201C;Ewing&#x201D; is at the beginning of the diagnosis entity, and &#x201C;sarcoma&#x201D; is inside of the entity, so they are labeled as &#x201C;B-Diagnosis&#x201D; and &#x201C;I<bold>-</bold>Diagnosis,&#x201D; respectively.</p><p>To summarize, we trained a multiclass classifier for the soft-prompting training step. 
There are 15 entity types in our dataset, therefore there are 15&#x00D7;2+1=31 labels for token classification, with one extra label for &#x201C;O.&#x201D;</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>An example of entity extraction as token classification.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e70706_fig02.png"/></fig></sec><sec id="s2-6"><title>Experimental Set-Up</title><p>For efficiency purposes, we used Apache cTAKES [<xref ref-type="bibr" rid="ref32">32</xref>] to split an abstract into sentences which were then passed to the LLMs to extract entities one sentence at a time. Our direct prompt included the instruction, the definition of each entity type, 5 examples (few-shot in-context learning) and the query (the sentence). The in-context learning [<xref ref-type="bibr" rid="ref4">4</xref>] is a common practice in LLM prompting and has consistently shown improved results as the examples guide the LLM onto an optimal path [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. <xref ref-type="fig" rid="figure3">Figure 3</xref> presents our prompt template, and examples are in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Prompt template used in direct prompting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e70706_fig03.png"/></fig><p>When choosing the LLMs, we used GPT-4o [<xref ref-type="bibr" rid="ref35">35</xref>], one of the most powerful proprietary LLMs at the time of this study, and SOTA open LLMs from the LLaMA3 family [<xref ref-type="bibr" rid="ref36">36</xref>], including LLaMA3.1 70B, LLaMA3.1 8B, LLaMA3.2 1B, and LLaMA3.2 3B. 
We did not use GPT-4o or LLaMA3.1 70B to train the soft prompts due to computational resource limitations; thus, our work here is representative of the computational environment in the vast majority of academic medical centers and research labs. We set the soft prompt length to 30. We trained the soft prompt on the training set for 50 epochs with a learning rate of 0.001. Hyperparameters were tuned on the development set using the LLaMA3.1 8B model.</p><p>We report the evaluation results on the test set in the next section. In addition, we apply 5-fold cross-validation and report the average scores with SDs. For the 5-fold cross-validation, we excluded the 3 abstracts used to sample the gold examples for direct prompting and split the remaining 97 abstracts into 5 folds with a 20:20:20:20:17 ratio. For direct prompting, we ran the model on each fold and reported the average scores. For soft prompting, we set aside one fold as the test set and trained the soft prompts on the remaining 4 folds.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>We used the standard evaluation metrics of precision or positive predictive value, recall or sensitivity, and <italic>F</italic><sub>1</sub>-score (the harmonic mean of precision and recall) with 2 evaluation settings: &#x201C;exact match&#x201D; setting requires the span output from the model to exactly match the span of the gold annotation, and &#x201C;overlapping match&#x201D; setting allows the model to get partial credit if its extraction overlaps the spans in the gold annotation. 
For example, the model may extract &#x201C;patient-derived tumor xenograft (PDX)&#x201D; as a model_type entity, while the gold annotation is &#x201C;patient-derived tumor xenograft (PDX) models.&#x201D; Under the &#x201C;exact match&#x201D; setting, &#x201C;patient-derived tumor xenograft (PDX)&#x201D; is NOT a match to &#x201C;patient-derived tumor xenograft (PDX) models;&#x201D; while under the &#x201C;overlapping match&#x201D; setting, it is a match since the spans overlap.</p><p><xref ref-type="table" rid="table3">Tables 3</xref> and <xref ref-type="table" rid="table4">4</xref> show the evaluation results on the test set. In <xref ref-type="table" rid="table3">Table 3</xref>, we can see that under the &#x201C;exact match&#x201D; setting, GPT-4o direct prompting achieves the highest <italic>F</italic><sub>1</sub>-score of 50.48. The performances of the LLaMA3 family models drop as the model size decreases, with <italic>F</italic><sub>1</sub>-score from 38.40 for the 70B model to 6.78 for the 1B model. However, there is a consistent improvement in <italic>F</italic><sub>1</sub>-scores with soft prompting over direct prompting. 
For the LLaMA3.2 models, the performance of the 3B model improves significantly, with the <italic>F</italic><sub>1</sub>-score rising from 7.06 to 46.68&#x2014;more than 8 points higher than the LLaMA3.1-70B model with direct prompting (<italic>F</italic><sub>1</sub>-score=38.40), despite the substantial difference in model size.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Evaluation results on the test set (exact match) as precision or positive predictive value, recall or sensitivity, and <italic>F</italic><sub>1</sub>-score (harmonic mean of precision and recall).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Exact match</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Direct prompting</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o</td><td align="left" valign="top">56.09</td><td align="left" valign="top">45.89<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">50.48<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-70B</td><td align="left" valign="top">57.27<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">28.89</td><td align="left" valign="top">38.40</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-8B</td><td align="left" valign="top">35.80</td><td align="left" valign="top">18.48</td><td align="left" valign="top">24.37</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-3B</td><td align="left" valign="top">25.23</td><td align="left" valign="top">4.10</td><td align="left" valign="top">7.06</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-1B</td><td align="left" valign="top">23.48</td><td align="left" valign="top">3.96</td><td align="left" valign="top">6.78</td></tr><tr><td align="left" valign="top" colspan="4">Soft prompting</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-8B</td><td align="left" valign="top">47.17</td><td align="left" valign="top">45.75</td><td align="left" valign="top">46.44</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-3B</td><td align="left" valign="top">47.30<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">46.09<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">46.68<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-1B</td><td align="left" valign="top">46.19</td><td align="left" valign="top">45.01</td><td align="left" valign="top">45.59</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>These are the best results.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Evaluation results on the test set (overlapping match) as precision or positive predictive value, recall or sensitivity, and <italic>F</italic><sub>1</sub>-score (harmonic mean of precision and recall).</p></caption><table id="table4" frame="hsides" 
rules="groups"><thead><tr><td align="left" valign="bottom">Overlapping match</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Direct prompting</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o</td><td align="left" valign="top">76.96</td><td align="left" valign="top">66.52<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">71.36<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-70B</td><td align="left" valign="top">77.95<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">43.99</td><td align="left" valign="top">56.24</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-8B</td><td align="left" valign="top">50.54</td><td align="left" valign="top">27.49</td><td align="left" valign="top">35.61</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-3B</td><td align="left" valign="top">41.03</td><td align="left" valign="top">7.03</td><td align="left" valign="top">12.00</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-1B</td><td align="left" valign="top">35.34</td><td align="left" valign="top">6.01</td><td align="left" valign="top">10.28</td></tr><tr><td align="left" valign="top" colspan="4">Soft prompting</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-8B</td><td align="left" valign="top">71.19</td><td align="left" valign="top">70.53</td><td align="left" valign="top">70.86</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-3B</td><td align="left" valign="top">72.05<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">71.55<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">71.80<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-1B</td><td align="left" valign="top">70.38</td><td align="left" valign="top">70.48</td><td align="left" valign="top">70.42</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>These are the best results.</p></fn></table-wrap-foot></table-wrap><p>Similar trends are observed in <xref ref-type="table" rid="table4">Table 4</xref> under the &#x201C;overlapping match&#x201D; evaluation. GPT4-o shows an <italic>F</italic><sub>1</sub>-score performance of 71.36, maintaining its position as the top performer for direct prompting. The 3 smaller LLaMA3 models continue to benefit from soft prompting, with the LLaMA3.2 3B model achieving slightly higher score than GPT4-o with direct prompting (<italic>F</italic><sub>1</sub>-scores of 71.80 vs 71.36 ).</p><p><xref ref-type="table" rid="table5">Tables 5</xref> and <xref ref-type="table" rid="table6">6</xref> present the results with 5-fold cross-validation under &#x201C;exact match&#x201D; and &#x201C;overlapping&#x201D; match respectively. 
Once again, our observations indicate that with soft prompting, the smaller LLaMA models attain performance levels comparable to GPT-4o.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Five-fold cross-validation results (exact match) as precision or positive predictive value, recall or sensitivity, and <italic>F</italic><sub>1</sub>-score (harmonic mean of precision and recall).</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Exact match</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Direct prompting, mean (SD)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o</td><td align="left" valign="top">60.73 (2.69)</td><td align="left" valign="top">49.92 (3.46)</td><td align="left" valign="top">54.75 (2.84)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-70B</td><td align="left" valign="top">57.56 (1.53)</td><td align="left" valign="top">31.70 (1.24)</td><td align="left" valign="top">40.87 (1.25)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-8B</td><td align="left" valign="top">38.29 (3.29)</td><td align="left" valign="top">20.57 (2.18)</td><td align="left" valign="top">26.75 (2.61)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-3B</td><td align="left" valign="top">27.01 (3.20)</td><td align="left" valign="top">5.25 (0.80)</td><td align="left" valign="top">8.80 (1.29)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-1B</td><td align="left" valign="top">9.84 (5.98)</td><td align="left" valign="top">0.74 (0.47)</td><td align="left" valign="top">1.38 (0.87)</td></tr><tr><td align="left" valign="top" colspan="4">Soft prompting, mean (SD)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-8B</td><td align="left" valign="top">51.76 (3.09)</td><td align="left" valign="top">50.21 (2.24)</td><td align="left" valign="top">50.94 (2.55)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-3B</td><td align="left" valign="top">50.99 (2.43)</td><td align="left" valign="top">49.54 (2.98)</td><td align="left" valign="top">50.24 (2.53)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-1B</td><td align="left" valign="top">49.34 (3.47)</td><td align="left" valign="top">49.98 (3.19)</td><td align="left" valign="top">49.13 (3.10)</td></tr></tbody></table></table-wrap><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Five-fold cross-validation results (overlapping match) as precision or positive predictive value, recall or sensitivity, and <italic>F</italic><sub>1</sub>-score (harmonic mean of precision and recall).</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Overlapping match</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Direct prompting, mean (SD)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o</td><td align="left" 
valign="top">77.82 (2.54)</td><td align="left" valign="top">67.52 (2.17)</td><td align="left" valign="top">72.28 (1.88)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-70B</td><td align="left" valign="top">78.01 (1.14)</td><td align="left" valign="top">47.77 (0.71)</td><td align="left" valign="top">59.25 (0.81)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-8B</td><td align="left" valign="top">52.75 (3.02)</td><td align="left" valign="top">29.78 (2.60)</td><td align="left" valign="top">38.04 (2.84)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-3B</td><td align="left" valign="top">42.42 (2.89)</td><td align="left" valign="top">8.64 (1.09)</td><td align="left" valign="top">14.34 (1.64)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-1B</td><td align="left" valign="top">22.09 (5.74)</td><td align="left" valign="top">1.67 (0.54)</td><td align="left" valign="top">3.10 (0.99)</td></tr><tr><td align="left" valign="top" colspan="4">Soft prompting, mean (SD)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.1-8B</td><td align="left" valign="top">73.78 (3.09)</td><td align="left" valign="top">73.77 (1.25)</td><td align="left" valign="top">73.75 (2.06)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-3B</td><td align="left" valign="top">73.48 (1.97)</td><td align="left" valign="top">73.51 (1.11)</td><td align="left" valign="top">73.48 (1.31)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLaMA3.2-1B</td><td align="left" valign="top">71.51 (3.43)</td><td align="left" valign="top">73.25 (2.46)</td><td align="left" valign="top">72.34 (2.63)</td></tr></tbody></table></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our experiments demonstrate that soft prompting, a relatively underexplored aspect of LLM prompting, can significantly enhance the performance of smaller LLMs. The 3 LLaMA models exhibit comparable performance under soft prompting (an <italic>F</italic><sub>1</sub>-score of 46 in the exact match setting, and 70 in the overlapping match setting). These results are particularly promising given the limited training data, consisting of 60 abstracts with 2089 entity mentions. Please note that all <italic>F</italic><sub>1</sub>-scores mentioned in this section refer to the <italic>F</italic><sub>1</sub>-scores on the test set.</p><p>How much data is needed to train the soft prompt? To answer this question, we trained the LLaMA3.2 1B model, the smallest model used in this work, with different amounts of training data. <xref ref-type="fig" rid="figure4">Figure 4</xref> shows the relation between the proportion of training data and the <italic>F</italic><sub>1</sub>-scores on the test set (overlapping match). Solid performance was achieved with only 5% of the training data (26 sentences from 3 abstracts). With 25% of the training data (129 sentences from 15 abstracts), the model achieved an <italic>F</italic><sub>1</sub>-score of 68.21, only 2 points lower than using the entire training set, and only 3 points lower than GPT4-o with direct prompting. 
Despite the impressive performance of GPT4-o direct prompting, one potential issue is that not all data used in biomedical research can be sent to proprietary models such as GPT or the Gemini family models [<xref ref-type="bibr" rid="ref8">8</xref>] via public application programming interfaces. That is, for applications using real patient data that require Health Insurance Portability and Accountability Act&#x2013;compliant platforms, our findings demonstrate that achieving performance comparable to proprietary LLMs such as GPT4-o remains feasible through soft prompting. However, this approach necessitates a tradeoff, requiring a small set of labeled data for optimal effectiveness.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Performance curve of the LLaMA3.2 1B model as the size of training data increases.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e70706_fig04.png"/></fig><p>Some entities appear more frequently than other entities in our dataset. For example, diagnosis and treatment mentions are more frequent than mentions of cancer_grade. In <xref ref-type="table" rid="table7">Table 7</xref>, we present the number of instances of each entity type in our dataset and the corresponding performance of GPT4-o direct prompting. We can see that GPT4-o performs the best for the entity types that have the most instances&#x2014;diagnosis, model type, and treatment entities. Of these frequent entity types, biomarker is the one with the lowest performance. Our error analysis points to several factors that could have contributed to these results, including ambiguous and inconsistent mentions and contextual dependencies. 
In this task, we defined a biomarker as &#x201C;gene, protein or biological molecule identified in or associated with patient&#x2019;s/model&#x2019;s tumor.&#x201D; Thus, biomarker entities can be mentioned using their full names (eg, epidermal growth factor receptor, lnc-RP11-536 K7.3, echinoderm microtubule-associated protein-like 4), standardized gene or protein symbols (<italic>NPM1</italic>, KRAS, PTEN) or abbreviations of metabolites (NADPH, D2HG). Moreover, a biomarker entity (eg, &#x201C;MEK&#x201D;) often overlaps with a treatment entity (eg, &#x201C;MEK inhibitor&#x201D;). The ambiguity in biomarker entity mentions might interfere with the model&#x2019;s ability to recognize them consistently. In addition, biomarker entities are often mentioned as lists (see Example 2) resulting in a different frequency within and across the abstracts and patterns of entity mentions, in comparison with other entities. Overall, ambiguity emerges as the primary source of error. More precise definitions, accompanied by examples illustrating the distinct meanings, might present a solution. Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides the breakdown of errors per entity type along with examples.</p><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Evaluation results of GPT4-o with direct prompts for each entity type as precision or positive predictive value, recall or sensitivity, and <italic>F</italic><sub>1</sub>-score (harmonic mean of precision and recall). 
Results are overlapping match setting on the test set.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Entity type</td><td align="left" valign="bottom">Training instances, n</td><td align="left" valign="bottom">Development instances, n</td><td align="left" valign="bottom">Test instances, n</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">IAA<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">diagnosis</td><td align="left" valign="top">362</td><td align="left" valign="top">122</td><td align="left" valign="top">114</td><td align="left" valign="top">92.47</td><td align="left" valign="top">75.44</td><td align="left" valign="top">83.09<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup></td><td align="left" valign="top">61.67</td></tr><tr><td align="left" valign="top">age_category</td><td align="left" valign="top">19</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.0</td><td align="left" valign="top">60</td></tr><tr><td align="left" valign="top">genetic_effect</td><td align="left" valign="top">69</td><td align="left" valign="top">20</td><td align="left" valign="top">33</td><td align="left" valign="top">45.71</td><td align="left" valign="top">47.06</td><td align="left" valign="top">46.38</td><td align="left" valign="top">57.67</td></tr><tr><td align="left" valign="top">model_type</td><td align="left" valign="top">326</td><td align="left" valign="top">114</td><td align="left" valign="top">110</td><td align="left" valign="top">88.07</td><td align="left" valign="top">84.21</td><td align="left" valign="top">86.10<sup><xref ref-type="table-fn" 
rid="table7fn2">b</xref></sup></td><td align="left" valign="top">53.33</td></tr><tr><td align="left" valign="top">molecular_char</td><td align="left" valign="top">128</td><td align="left" valign="top">37</td><td align="left" valign="top">46</td><td align="left" valign="top">65.22</td><td align="left" valign="top">63.83</td><td align="left" valign="top">64.52<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup></td><td align="left" valign="top">54.33</td></tr><tr><td align="left" valign="top">biomarker</td><td align="left" valign="top">503</td><td align="left" valign="top">118</td><td align="left" valign="top">163</td><td align="left" valign="top">85.05</td><td align="left" valign="top">55.49</td><td align="left" valign="top">67.16<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup></td><td align="left" valign="top">61.33</td></tr><tr><td align="left" valign="top">treatment</td><td align="left" valign="top">426</td><td align="left" valign="top">77</td><td align="left" valign="top">130</td><td align="left" valign="top">81.74</td><td align="left" valign="top">70.15</td><td align="left" valign="top">75.50<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup></td><td align="left" valign="top">55.67</td></tr><tr><td align="left" valign="top">response_to _treatment</td><td align="left" valign="top">99</td><td align="left" valign="top">21</td><td align="left" valign="top">28</td><td align="left" valign="top">38.64</td><td align="left" valign="top">60.71</td><td align="left" valign="top">47.22</td><td align="left" valign="top">55</td></tr><tr><td align="left" valign="top">sample_type</td><td align="left" valign="top">22</td><td align="left" valign="top">8</td><td align="left" valign="top">7</td><td align="left" valign="top">45.45</td><td align="left" valign="top">71.43</td><td align="left" valign="top">55.56<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup></td><td align="left" valign="top">49</td></tr><tr><td align="left" 
valign="top">tumor_type</td><td align="left" valign="top">61</td><td align="left" valign="top">19</td><td align="left" valign="top">28</td><td align="left" valign="top">66.67</td><td align="left" valign="top">57.14</td><td align="left" valign="top">61.54<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup></td><td align="left" valign="top">49.67</td></tr><tr><td align="left" valign="top">cancer_grade</td><td align="left" valign="top">6</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">50.0</td><td align="left" valign="top">100</td><td align="left" valign="top">66.67<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup></td><td align="left" valign="top">42</td></tr><tr><td align="left" valign="top">cancer_stage</td><td align="left" valign="top">7</td><td align="left" valign="top">1</td><td align="left" valign="top">4</td><td align="left" valign="top">33.33</td><td align="left" valign="top">25.0</td><td align="left" valign="top">28.57</td><td align="left" valign="top">59.33</td></tr><tr><td align="left" valign="top">clinical_trial</td><td align="left" valign="top">35</td><td align="left" valign="top">2</td><td align="left" valign="top">4</td><td align="left" valign="top">80.0</td><td align="left" valign="top">100</td><td align="left" valign="top">88.89<sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup></td><td align="left" valign="top">60.67</td></tr><tr><td align="left" valign="top">host_strain</td><td align="left" valign="top">9</td><td align="left" valign="top">0</td><td align="left" valign="top">7</td><td align="left" valign="top">100</td><td align="left" valign="top">28.57</td><td align="left" valign="top">44.44</td><td align="left" valign="top">61.67</td></tr><tr><td align="left" valign="top">model_id</td><td align="left" valign="top">17</td><td align="left" valign="top">2</td><td align="left" valign="top">7</td><td align="left" valign="top">66.67</td><td align="left" 
valign="top">28.57</td><td align="left" valign="top">40.0</td><td align="left" valign="top">100</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>IAA: interannotator agreement.</p></fn><fn id="table7fn2"><p><sup>b</sup><italic>F</italic><sub>1</sub>-scores exceeding interannotator agreement.</p></fn></table-wrap-foot></table-wrap><disp-quote><p>Example 2:</p><p>Genomic alterations involved RB1 (55%), TP53 (46%), PTEN (29%), BRCA2 (29%), and AR (27%), and there was a range of androgen receptor signaling and NEPC marker expression.</p></disp-quote><p>The moderate performance of entity types such as genetic_effect, molecular_char, response_to_treatment, and tumor_type is due to the number of training instances ranging from 61 to 128 as well as the IAA ranging from 49.67 to 57.67. The moderate IAA scores of those entity types underscore the need for refined annotation protocols and modeling strategies that better capture domain-specific knowledge. Furthermore, the lower performance observed for entity types with smaller sample sizes (eg, model_id) highlights the need for enhancing model performance on low-frequency labels. Future research could explore strategies such as data augmentation to improve the model&#x2019;s generalizability for underrepresented entities.</p><p>The extraction of PDCM-relevant knowledge is not an easy task for the domain experts as indicated by the IAA (<italic>F</italic><sub>1</sub>-score below 65 for all entity types except for model_id). In 9 out of 15 entity types, the system performance in an overlapping match setting exceeds the IAA (last two columns of <xref ref-type="table" rid="table7">Table 7</xref>). This is the case for categories with plentiful training instances (eg, diagnosis, model_type) as well as for categories with fewer training instances (eg, sample_type, cancer_grade). 
For the exact match setting, in 6 out of 15 entity types, the system performance exceeds the IAA (last two columns in <xref ref-type="table" rid="table8">Table 8</xref>). Therefore, the LLM could be a viable assistant, with its outputs reviewed by a domain expert to ensure the accuracy of the final extraction. We believe such human-in-the-loop approaches present a promising direction for future research and application.</p><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Evaluation results of GPT-4o with direct prompts for each entity type as precision or positive predictive value, recall or sensitivity, and <italic>F</italic><sub>1</sub>-score (harmonic mean of precision and recall). Results are exact match setting on the test set.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Entity type</td><td align="left" valign="bottom">Training instances, n</td><td align="left" valign="bottom">Development instances, n</td><td align="left" valign="bottom">Test instances, n</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="bottom">IAA<sup><xref ref-type="table-fn" rid="table8fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">diagnosis</td><td align="left" valign="top">362</td><td align="left" valign="top">122</td><td align="left" valign="top">114</td><td align="left" valign="top">77.17</td><td align="left" valign="top">62.28</td><td align="left" valign="top">68.93<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td><td align="left" valign="top">61.67</td></tr><tr><td align="left" valign="top">age_category</td><td align="left" valign="top">19</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0.0</td><td align="left" valign="top">0.0</td><td align="left" 
valign="top">0.0</td><td align="left" valign="top">60.0</td></tr><tr><td align="left" valign="top">genetic_effect</td><td align="left" valign="top">69</td><td align="left" valign="top">20</td><td align="left" valign="top">33</td><td align="left" valign="top">25.71</td><td align="left" valign="top">27.27</td><td align="left" valign="top">26.47</td><td align="left" valign="top">57.67</td></tr><tr><td align="left" valign="top">model_type</td><td align="left" valign="top">326</td><td align="left" valign="top">114</td><td align="left" valign="top">110</td><td align="left" valign="top">56.88</td><td align="left" valign="top">56.36</td><td align="left" valign="top">56.62<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td><td align="left" valign="top">53.33</td></tr><tr><td align="left" valign="top">molecular_char</td><td align="left" valign="top">128</td><td align="left" valign="top">37</td><td align="left" valign="top">46</td><td align="left" valign="top">54.35</td><td align="left" valign="top">54.35</td><td align="left" valign="top">54.35<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td><td align="left" valign="top">54.33</td></tr><tr><td align="left" valign="top">biomarker</td><td align="left" valign="top">503</td><td align="left" valign="top">118</td><td align="left" valign="top">163</td><td align="left" valign="top">46.74</td><td align="left" valign="top">26.38</td><td align="left" valign="top">33.73</td><td align="left" valign="top">61.33</td></tr><tr><td align="left" valign="top">treatment</td><td align="left" valign="top">426</td><td align="left" valign="top">77</td><td align="left" valign="top">130</td><td align="left" valign="top">72.34</td><td align="left" valign="top">52.31</td><td align="left" valign="top">60.71<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td><td align="left" valign="top">55.67</td></tr><tr><td align="left" valign="top">response_to _treatment</td><td align="left" valign="top">99</td><td 
align="left" valign="top">21</td><td align="left" valign="top">28</td><td align="left" valign="top">27.91</td><td align="left" valign="top">42.86</td><td align="left" valign="top">33.80</td><td align="left" valign="top">55</td></tr><tr><td align="left" valign="top">sample_type</td><td align="left" valign="top">22</td><td align="left" valign="top">8</td><td align="left" valign="top">7</td><td align="left" valign="top">45.45</td><td align="left" valign="top">71.43</td><td align="left" valign="top">55.56<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td><td align="left" valign="top">49</td></tr><tr><td align="left" valign="top">tumor_type</td><td align="left" valign="top">61</td><td align="left" valign="top">19</td><td align="left" valign="top">28</td><td align="left" valign="top">50.0</td><td align="left" valign="top">39.29</td><td align="left" valign="top">44.0</td><td align="left" valign="top">49.67</td></tr><tr><td align="left" valign="top">cancer_grade</td><td align="left" valign="top">6</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">50.0</td><td align="left" valign="top">100</td><td align="left" valign="top">66.67<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td><td align="left" valign="top">42</td></tr><tr><td align="left" valign="top">cancer_stage</td><td align="left" valign="top">7</td><td align="left" valign="top">1</td><td align="left" valign="top">4</td><td align="left" valign="top">33.33</td><td align="left" valign="top">25.0</td><td align="left" valign="top">28.57</td><td align="left" valign="top">59.33</td></tr><tr><td align="left" valign="top">clinical_trial</td><td align="left" valign="top">35</td><td align="left" valign="top">2</td><td align="left" valign="top">4</td><td align="left" valign="top">40.0</td><td align="left" valign="top">50.0</td><td align="left" valign="top">44.44</td><td align="left" valign="top">60.67</td></tr><tr><td align="left" 
valign="top">host_strain</td><td align="left" valign="top">9</td><td align="left" valign="top">0</td><td align="left" valign="top">7</td><td align="left" valign="top">100</td><td align="left" valign="top">14.29</td><td align="left" valign="top">25.0</td><td align="left" valign="top">61.67</td></tr><tr><td align="left" valign="top">model_id</td><td align="left" valign="top">17</td><td align="left" valign="top">2</td><td align="left" valign="top">7</td><td align="left" valign="top">66.67</td><td align="left" valign="top">28.57</td><td align="left" valign="top">40.0</td><td align="left" valign="top">100</td></tr></tbody></table><table-wrap-foot><fn id="table8fn1"><p><sup>a</sup>IAA: interannotator agreement.</p></fn><fn id="table8fn2"><p><sup>b</sup><italic>F</italic><sub>1</sub>-scores exceeding the interannotator agreement.</p></fn></table-wrap-foot></table-wrap><p>We would like to note that the work presented in the paper was done in a computational environment representative of the vast majority of academic medical centers and nonindustry research labs. Although we have access to SOTA Graphics Processing Units, we still found ourselves constrained as to the extent to which we could use very large language models. The larger community needs to address the growing gap in computational resources between big tech and the rest of the research community.</p></sec><sec id="s4-2"><title>Limitations</title><p>As this is a feasibility study, we limited ourselves to the extraction of entity mentions of 15 entity types chosen from attributes in the descriptive standards for PDCMs. While these are recognized by the PDCM and oncology community, they do not cover all knowledge in the PDCM-relevant texts. Some refinement of the entity types will be beneficial to improve prompting results.</p><p>We limited our corpus to 100 abstracts from papers associated with PDCMs deposited in CancerModels.Org. 
We did not assess the abstracts for the presence and equal distribution of all the entities. Thus, there were very few mentions of some entities in the corpus (eg, cancer_stage), negatively affecting our overall <italic>F</italic><sub>1</sub>-score. We decided not to exclude those entities as these results could guide refinements of future studies. The computational methods discussed here are applicable to other studies requiring the extraction of textual information from scientific papers. Future work could involve extending this method to extract knowledge from the main body of the papers.</p></sec><sec id="s4-3"><title>Conclusions</title><p>This study investigates the potential of LLMs as powerful tools for extracting PDCM-relevant knowledge from scientific literature&#x2014;an essential task for advancing cancer research and precision medicine. By comparing direct and soft prompting across both proprietary and open LLMs, we provide valuable insights into the most effective strategies for PDCM-relevant knowledge extraction. Our findings indicate that GPT-4o, when used with direct prompting, maintains competitive performance, while soft prompting significantly enhances the effectiveness of smaller LLMs. In conclusion, our results demonstrate that training soft prompts on smaller open models can achieve performance levels comparable to those of proprietary LLMs.</p><p>To our knowledge, this is the first study to implement SOTA LLMs prompting for knowledge extraction in the PDCM domain and the first to explore the emerging topic of soft prompting in this context. Our findings demonstrate that LLMs can effectively streamline the extraction of complex cancer model metadata, potentially reducing the burden of manual curation and accelerating the integration of PDCMs into research and clinical workflows. Additionally, this study lays the foundation for future research aimed at optimizing LLMs for large-scale knowledge extraction tasks. 
Efficiently extracting and harmonizing PDCM-relevant knowledge will ultimately drive progress in cancer research and precision oncology, equipping researchers and clinicians with better tools to improve patient outcomes. More broadly, our study contributes to the ongoing discourse on the applicability of LLMs, acknowledging that while they offer transformative potential, they are not a universal solution for all tasks.</p></sec></sec></body><back><ack><p>Funding was provided by the US National Institutes of Health (U24CA248010, R01LM013486, U24CA253539) and European Bioinformatics Institute (EMBL-EBI) Core Funds.</p></ack><notes><sec><title>Data Availability</title><p>The data and code will be available upon publication in the CancerModels.Org Github repository [<xref ref-type="bibr" rid="ref37">37</xref>].</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">IAA</term><def><p>interannotator agreement</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">PDCM</term><def><p>patient-derived cancer model</p></def></def-item><def-item><term id="abb4">PDX</term><def><p>patient-derived xenografts</p></def></def-item><def-item><term id="abb5">SOTA</term><def><p>state-of-the-art</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>RePORTER</article-title><source>National Institutes of Health</source><access-date>2024-12-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://reporter.nih.gov/">https://reporter.nih.gov/</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Perova</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Martinez</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mandloi</surname><given-names>T</given-names> </name><etal/></person-group><article-title>PDCM Finder: an open global research platform for patient-derived cancer models</article-title><source>Nucleic Acids Res</source><year>2023</year><month>01</month><day>6</day><volume>51</volume><issue>D1</issue><fpage>D1360</fpage><lpage>D1366</lpage><pub-id pub-id-type="doi">10.1093/nar/gkac1021</pub-id><pub-id pub-id-type="medline">36399494</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Vaswani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Parmar</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Attention is all you need</article-title><source>arXiv</source><access-date>2025-06-12</access-date><comment>Preprint posted online on  Jun 12, 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><source>arXiv</source><comment>Preprint posted online on  May 28, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab><name 
name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 15, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bubeck</surname><given-names>S</given-names> </name><name name-style="western"><surname>Petro</surname><given-names>J</given-names> </name></person-group><article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title><source>N Engl J Med</source><year>2023</year><month>03</month><day>30</day><volume>388</volume><issue>13</issue><fpage>1233</fpage><lpage>1239</lpage><pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id><pub-id pub-id-type="medline">36988602</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omiye</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Gui</surname><given-names>H</given-names> </name><name name-style="western"><surname>Rezaei</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Daneshjou</surname><given-names>R</given-names> </name></person-group><article-title>Large language models in medicine: the potentials and pitfalls: a narrative review</article-title><source>Ann Intern Med</source><year>2024</year><month>02</month><volume>177</volume><issue>2</issue><fpage>210</fpage><lpage>220</lpage><pub-id 
pub-id-type="doi">10.7326/M23-2772</pub-id><pub-id pub-id-type="medline">38285984</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Saab</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>WH</given-names> </name><etal/></person-group><article-title>Capabilities of Gemini models in medicine</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 29, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.18416</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanjee</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Crowe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rodman</surname><given-names>A</given-names> </name></person-group><article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title><source>JAMA</source><year>2023</year><month>07</month><day>3</day><volume>330</volume><issue>1</issue><fpage>78</fpage><lpage>80</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id><pub-id pub-id-type="medline">37318797</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savage</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nayak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gallo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rangan</surname><given-names>E</given-names> </name><name 
name-style="western"><surname>Chen</surname><given-names>JH</given-names> </name></person-group><article-title>Diagnostic reasoning prompts reveal the potential for large language model interpretability in medicine</article-title><source>NPJ Digit Med</source><year>2024</year><month>01</month><day>24</day><volume>7</volume><issue>1</issue><fpage>20</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01010-1</pub-id><pub-id pub-id-type="medline">38267608</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Williams</surname><given-names>CYK</given-names> </name><name name-style="western"><surname>Miao</surname><given-names>BY</given-names> </name><name name-style="western"><surname>Kornblith</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Butte</surname><given-names>AJ</given-names> </name></person-group><article-title>Evaluating the use of large language models to provide clinical recommendations in the emergency department</article-title><source>Nat Commun</source><year>2024</year><month>10</month><day>8</day><volume>15</volume><issue>1</issue><fpage>8236</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-52415-1</pub-id><pub-id pub-id-type="medline">39379357</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayers</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Poliak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title><source>JAMA Intern 
Med</source><year>2023</year><month>06</month><day>1</day><volume>183</volume><issue>6</issue><fpage>589</fpage><lpage>596</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id><pub-id pub-id-type="medline">37115527</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lucas</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Upperman</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>JR</given-names> </name></person-group><article-title>A systematic review of large language models and their implications in medical education</article-title><source>Med Educ</source><year>2024</year><month>11</month><volume>58</volume><issue>11</issue><fpage>1276</fpage><lpage>1285</lpage><pub-id pub-id-type="doi">10.1111/medu.15402</pub-id><pub-id pub-id-type="medline">38639098</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name 
name-style="western"><surname>Perot</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Luisier</surname><given-names>F</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ku</surname><given-names>LW</given-names> </name><name name-style="western"><surname>Martins</surname><given-names>A</given-names> </name><name name-style="western"><surname>Srikumar</surname><given-names>V</given-names> </name></person-group><article-title>LMDX: language model-based document information extraction and localization</article-title><source>Findings of the Association for Computational Linguistics ACL 2024</source><year>2024</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>15140</fpage><lpage>15168</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.899</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Arsenyan</surname><given-names>V</given-names> </name><name name-style="western"><surname>Bughdaryan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shaya</surname><given-names>F</given-names> </name><name name-style="western"><surname>Small</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Shahnazaryan</surname><given-names>D</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Demner-Fushman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ananiadou</surname><given-names>S</given-names> </name><name name-style="western"><surname>Miwa</surname><given-names>M</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Tsujii</surname><given-names>J</given-names> </name></person-group><article-title>Large language models for biomedical knowledge graph construction: information extraction from EMR notes</article-title><source>Proceedings of the 23rd Workshop on Biomedical Natural Language Processing</source><year>2024</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>295</fpage><lpage>317</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.bionlp-1.23</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Munnangi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Feldman</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>B</given-names> </name><name name-style="western"><surname>Amir</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hope</surname><given-names>T</given-names> </name><name name-style="western"><surname>Naik</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Duh</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gomez</surname><given-names>H</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name></person-group><article-title>On-the-fly definition augmentation of LLMs for biomedical NER</article-title><source>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics</source><year>2024</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>3833</fpage><lpage>3854</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.naacl-long.212</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation 
citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Lester</surname><given-names>B</given-names> </name><name name-style="western"><surname>Al-Rfou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Constant</surname><given-names>N</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Moens</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Specia</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yih</surname><given-names>SW</given-names> </name></person-group><article-title>The power of scale for parameter-efficient prompt tuning</article-title><source>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</source><year>2021</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>3045</fpage><lpage>3059</lpage><pub-id pub-id-type="doi">10.18653/v1/2021.emnlp-main.243</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meehan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Conte</surname><given-names>N</given-names> </name><name name-style="western"><surname>Goldstein</surname><given-names>T</given-names> </name><etal/></person-group><article-title>PDX-MI: minimal information for patient-derived tumor xenograft models</article-title><source>Cancer Res</source><year>2017</year><month>11</month><day>1</day><volume>77</volume><issue>21</issue><fpage>e62</fpage><lpage>e66</lpage><pub-id pub-id-type="doi">10.1158/0008-5472.CAN-17-0582</pub-id><pub-id pub-id-type="medline">29092942</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation 
citation-type="web"><article-title>PDCMFinder/MI-standard-in-vitro-models</article-title><source>GitHub</source><access-date>2024-12-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/PDCMFinder/MI-Standard-In-vitro-models">https://github.com/PDCMFinder/MI-Standard-In-vitro-models</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hripcsak</surname><given-names>G</given-names> </name><name name-style="western"><surname>Rothschild</surname><given-names>AS</given-names> </name></person-group><article-title>Agreement, the F-measure, and reliability in information retrieval</article-title><source>J Am Med Inform Assoc</source><year>2005</year><volume>12</volume><issue>3</issue><fpage>296</fpage><lpage>298</lpage><pub-id pub-id-type="doi">10.1197/jamia.M1733</pub-id><pub-id pub-id-type="medline">15684123</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 28, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2201.11903</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Self-consistency improves chain of thought reasoning in language models</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 21, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.11171</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Du</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>GPT understands, too</article-title><source>arXiv</source><access-date>2025-06-12</access-date><comment>Preprint posted online on  Mar 18, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2103.10385</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Schulhoff</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ilie</surname><given-names>M</given-names> </name><name name-style="western"><surname>Balepur</surname><given-names>N</given-names> </name><etal/></person-group><article-title>The prompt report: a systematic survey of prompting techniques</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.06608</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>XL</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>P</given-names> </name></person-group><article-title>Prefix-tuning: optimizing 
continuous prompts for generation</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 1, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2101.00190</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Muresanu</surname><given-names>AI</given-names> </name><name name-style="western"><surname>Han</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Paster</surname><given-names>K</given-names> </name><name name-style="western"><surname>Pitis</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>H</given-names> </name></person-group><article-title>Large language models are human-level prompt engineers</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 3, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2211.01910</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Corrado</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>J</given-names> </name></person-group><article-title>Distributed representations of words and phrases and their compositionality</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 16, 2013</comment><pub-id pub-id-type="doi">10.48550/arXiv.1310.4546</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rosenblatt</surname><given-names>F</given-names> </name></person-group><article-title>The perceptron: a probabilistic model for information storage and organization in the brain</article-title><source>Psychol Rev</source><year>1958</year><month>11</month><volume>65</volume><issue>6</issue><fpage>386</fpage><lpage>408</lpage><pub-id pub-id-type="doi">10.1037/h0042519</pub-id><pub-id pub-id-type="medline">13602029</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hochreiter</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schmidhuber</surname><given-names>J</given-names> </name></person-group><article-title>Long short-term memory</article-title><source>Neural Comput</source><year>1997</year><month>11</month><day>15</day><volume>9</volume><issue>8</issue><fpage>1735</fpage><lpage>1780</lpage><pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id><pub-id pub-id-type="medline">9377276</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Tjong Kim Sang</surname><given-names>EF</given-names> </name><name name-style="western"><surname>De Meulder</surname><given-names>F</given-names> </name></person-group><article-title>Introduction to the CoNLL-2003 shared task: language-independent named entity recognition</article-title><conf-name>Proceedings of the Seventh Conference on Natural Language Learning at HLT-NAACL</conf-name><conf-date>May 31, 2003</conf-date><conf-loc>Edmonton, Canada</conf-loc><pub-id pub-id-type="doi">10.3115/1119176.1119195</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Savova</surname><given-names>GK</given-names> </name><name name-style="western"><surname>Masanz</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Ogren</surname><given-names>PV</given-names> </name><etal/></person-group><article-title>Mayo clinical Text Analysis and Knowledge Extraction System (cTAKES): architecture, component evaluation and applications</article-title><source>J Am Med Inform Assoc</source><year>2010</year><volume>17</volume><issue>5</issue><fpage>507</fpage><lpage>513</lpage><pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id><pub-id pub-id-type="medline">20819853</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>von Oswald</surname><given-names>J</given-names> </name><name name-style="western"><surname>Niklasson</surname><given-names>E</given-names> </name><name name-style="western"><surname>Randazzo</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Transformers learn in-context by gradient descent</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 15, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2212.07677</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hendel</surname><given-names>R</given-names> </name><name name-style="western"><surname>Geva</surname><given-names>M</given-names> </name><name name-style="western"><surname>Globerson</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Bouamor</surname><given-names>H</given-names> </name><name name-style="western"><surname>Pino</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Bali</surname><given-names>K</given-names> </name></person-group><article-title>In-context learning creates task vectors</article-title><source>Findings of the Association for Computational Linguistics: EMNLP 2023</source><year>2023</year><publisher-name>Association for Computational Linguistics</publisher-name><fpage>9318</fpage><lpage>9333</lpage><pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.624</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Hurst</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lerer</surname><given-names>A</given-names> </name><etal/></person-group><article-title>GPT-4o system card</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 25, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.21276</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>PDCMFinder/prompt-llm</article-title><source>GitHub</source><access-date>2024-12-23</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://github.com/PDCMFinder/prompt-llm">https://github.com/PDCMFinder/prompt-llm</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompts used in direct prompting experiments and detailed error analysis.</p><media xlink:href="bioinform_v6i1e70706_app1.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material></app-group></back></article>