<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Bioinform Biotech</journal-id><journal-id journal-id-type="publisher-id">bioinform</journal-id><journal-id journal-id-type="index">19</journal-id><journal-title>JMIR Bioinformatics and Biotechnology</journal-title><abbrev-journal-title>JMIR Bioinform Biotech</abbrev-journal-title><issn pub-type="epub">2563-3570</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v6i1e67801</article-id><article-id pub-id-type="doi">10.2196/67801</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Systemic Anticancer Therapy Timelines Extraction From Electronic Medical Records Text: Algorithm Development and Validation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Yao</surname><given-names>Jiarui</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Goldner</surname><given-names>Eli</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hochheiser</surname><given-names>Harry</given-names></name><degrees>PhD</degrees><xref ref-type="aff" 
rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Finan</surname><given-names>Sean</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Levander</surname><given-names>John</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Harris</surname><given-names>David</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Groen</surname><given-names>Piet C de</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Buchbinder</surname><given-names>Elizabeth</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bitterman</surname><given-names>Danielle</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Warner</surname><given-names>Jeremy L</given-names></name><degrees>MD, MS</degrees><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Savova</surname><given-names>Guergana</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Computational Health Informatics Program, Boston Children&#x2019;s Hospital, Harvard Medical School</institution><addr-line>401 Park Drive</addr-line><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff 
id="aff2"><institution>Department of Biomedical Informatics, University of Pittsburgh</institution><addr-line>Pittsburgh</addr-line><addr-line>PA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Department of Medicine, Division of Gastroenterology, Hepatology and Nutrition, University of Minnesota</institution><addr-line>Minneapolis</addr-line><addr-line>MN</addr-line><country>United States</country></aff><aff id="aff4"><institution>Department of Medical Oncology, Dana Farber Cancer Institute</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff5"><institution>Artificial Intelligence in Medicine (AIM) Program, Mass General Brigham, Harvard Medical School</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff6"><institution>Department of Radiation Oncology, Brigham and Women&#x2019;s Hospital/Dana-Farber Cancer Institute</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><aff id="aff7"><institution>Center for Clinical Cancer Informatics and Data Science, Legorreta Cancer Center, Brown University</institution><addr-line>Providence</addr-line><addr-line>RI</addr-line><country>United States</country></aff><aff id="aff8"><institution>Brown University Health Cancer Institute, Rhode Island Hospital</institution><addr-line>Providence</addr-line><addr-line>RI</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Finkelstein</surname><given-names>Joseph</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Gaurav Kumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Markose</surname><given-names>Ginoop 
Chennekkattu</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zhou</surname><given-names>Huixue</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wu</surname><given-names>Jinge</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jiarui Yao, PhD, Computational Health Informatics Program, Boston Children&#x2019;s Hospital, Harvard Medical School, 401 Park Drive, Boston, MA, 02115, United States, 1 7813545014; <email>jiarui.yao@childrens.harvard.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>3</day><month>9</month><year>2025</year></pub-date><volume>6</volume><elocation-id>e67801</elocation-id><history><date date-type="received"><day>22</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>22</day><month>05</month><year>2025</year></date><date date-type="accepted"><day>07</day><month>07</month><year>2025</year></date></history><copyright-statement>&#x00A9; Jiarui Yao, Eli Goldner, Harry Hochheiser, Sean Finan, John Levander, David Harris, Piet C de Groen, Elizabeth Buchbinder, Danielle Bitterman, Jeremy L Warner, Guergana Savova. Originally published in JMIR Bioinformatics and Biotechnology (<ext-link ext-link-type="uri" xlink:href="https://bioinform.jmir.org">https://bioinform.jmir.org</ext-link>), 3.9.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/">http://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Bioinformatics and Biotechnology, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://bioinform.jmir.org/">https://bioinform.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://bioinform.jmir.org/2025/1/e67801"/><abstract><sec><title>Background</title><p>The systemic treatment of cancer typically requires the use of multiple anticancer agents in combination or sequentially. Clinical narrative texts often contain extensive descriptions of the temporal sequencing of systemic anticancer therapy (SACT), setting up an important task that may be amenable to automated extraction of SACT timelines.</p></sec><sec><title>Objective</title><p>We aimed to explore automatic methods for extracting patient-level SACT timelines from clinical narratives in the electronic medical records (EMRs).</p></sec><sec sec-type="methods"><title>Methods</title><p>We used two datasets from two institutions: (1) a colorectal cancer (CRC) dataset including the entire EMR of the 199 patients in the THYME (Temporal Histories of Your Medical Event) dataset and (2) the 2024 ChemoTimelines shared task dataset including 149 patients with ovarian cancer, breast cancer, and melanoma. 
We explored finetuning smaller language models trained to attend to events and time expressions, and few-shot prompting of large language models (LLMs). Evaluation used the 2024 ChemoTimelines shared task configuration&#x2014;Subtask1 involving the construction of SACT timelines from manually annotated SACT event and time expression mentions provided as input in addition to the patient&#x2019;s notes and Subtask2 requiring extraction of SACT timelines directly from the patient&#x2019;s notes.</p></sec><sec sec-type="results"><title>Results</title><p>Our task-specific finetuned EntityBERT model achieved 93% <italic>F</italic><sub>1</sub>-score, outperforming the best results in Subtask1 of the 2024 ChemoTimelines shared task (90%). It ranked second in Subtask2. LLM (LLaMA2, LLaMA3.1, and Mixtral) performance lagged the task-specific finetuned model performance for both the THYME and shared task datasets. On the shared task datasets, the best LLM performance was 77% macro <italic>F</italic><sub>1</sub>-score, 16 percentage points lower than the task-specific finetuned system (Subtask1).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>In this paper, we explored approaches for patient-level timeline extraction through the SACT timeline extraction task. Our results and analysis add to the knowledge of extracting treatment timelines from EMR clinical narratives using language modeling methods.</p></sec></abstract><kwd-group><kwd>systemic anticancer therapy</kwd><kwd>electronic medical records</kwd><kwd>treatment timelines extraction</kwd><kwd>natural language processing</kwd><kwd>large language models</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The systemic treatment of cancer typically requires the use of multiple anticancer agents in combination or sequentially. 
Systemic anticancer therapy (SACT), which includes traditional cytotoxic chemotherapy, endocrine therapy, targeted therapy, and immunotherapy, has both a low therapeutic index as well as synergistic potential when agents are given in combination. Due to cumulative toxicities, the order in which SACT components are received is much more important than only whether individual drug exposures happened or not, whether in the curative or noncurative setting. Furthermore, patients may receive an extended sequence of treatments across multiple health care settings, systems, and insurance arrangements, making an accurate tally of the totality of treatment using standard structured data resources extremely challenging if not impossible. Meanwhile, clinical narrative texts often contain extensive descriptions of the temporal sequencing of SACT, setting up an important task that may be amenable to automated extraction approaches.</p><p>Clinical natural language processing (NLP) is a field that builds computational methods to enable machines to process clinical narratives. Temporality has been a key research area within clinical NLP as it has a wide range of applications including temporal sequencing of SACT [<xref ref-type="bibr" rid="ref1">1</xref>]. The focus of temporality extraction in clinical NLP has been mainly on instance-level pairwise temporal relation extraction from electronic medical records (EMRs). Instance-level pairwise temporal relations (TLINKs) are the links between an event (EVENT) mention and a temporal expression (TIMEX3) mention or between two event mentions, constituting a triple of the TLINK and the other two components. The set of TLINK values, that is, types of temporal relations, is CONTAINS, BEFORE, OVERLAP, BEGINS-ON, ENDS-ON, and NOTED-ON [<xref ref-type="bibr" rid="ref1">1</xref>]. 
The event that CONTAINS another event is referred to as a narrative container (CONTAINS-1 is the reverse of CONTAINS, meaning an EVENT is contained by the narrative container). In addition, each EVENT has a temporal relation with the document creation time (DocTimeRel), one of BEFORE, BEFORE-OVERLAP, OVERLAP, or AFTER.</p><p>The construction of benchmarks, such as THYME (Temporal Histories of Your Medical Event) and i2b2 [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>], along with the SemEval shared tasks [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>] on temporality advanced the methodologies and established the state-of-the-art (SOTA) for the task [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. The sophisticated SOTA methods for temporal relation extraction open the door for exploring automatic patient-level timeline construction.</p><p>The 2024 ChemoTimelines shared task [<xref ref-type="bibr" rid="ref13">13</xref>] formulated SACT timeline construction as an information extraction task and provided the deidentified free text documents (except for dates) from the EMRs of 57,520 (breast and ovarian cancer) and 15,946 (melanoma) patients from University of Pittsburgh Medical Center. The documents represented a wide variety of notes, for example, pathology reports, clinical notes, radiology reports, emergency department visits, discharge summaries, etc. A subset of 149 patients was expert-annotated for EVENT mentions, TIMEX3 mentions, and instance-level pairwise temporal relations following the THYME2 schema [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref14">14</xref>] and patient-level timelines of SACT events. The shared task offered 2 subtasks. &#x201C;Subtask1&#x201D; involved creating timelines from gold EVENTS and TIMEX3 mentions. 
&#x201C;Subtask2&#x201D; challenged the participants to build end-to-end systems that extracted patient-level SACT timelines directly from the free texts. In this work, &#x201C;end-to-end&#x201D; means all text processing is done automatically. <xref ref-type="fig" rid="figure1">Figure 1</xref> summarizes the 2 subtasks. Various approaches were explored by the shared task participants&#x2014;from supervised finetuning [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>] to LLM prompting [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. The impressive results (<italic>F</italic><sub>1</sub>-score=90 for Subtask1 and <italic>F</italic><sub>1</sub>-score=70 for Subtask2) achieved by the systems from top participants [<xref ref-type="bibr" rid="ref15">15</xref>] demonstrated the usability and effectiveness of NLP models for this task. The top systems implemented task-specific finetuning of smaller pretrained language models (LMs). Specifically, the LAILab system [<xref ref-type="bibr" rid="ref15">15</xref>] cast the task as a sequence-to-sequence task, and finetuned Flan-T5-XXL [<xref ref-type="bibr" rid="ref19">19</xref>] and BART-large [<xref ref-type="bibr" rid="ref20">20</xref>]. It achieved the best results in the shared task for both subtasks. The Wonder system [<xref ref-type="bibr" rid="ref16">16</xref>] generated synthetic data using GPT-4 for data augmentation, then finetuned BioLM [<xref ref-type="bibr" rid="ref21">21</xref>]. The baseline system offered by the organizers [<xref ref-type="bibr" rid="ref13">13</xref>] also took the supervised finetuning approach with PubMedBERT [<xref ref-type="bibr" rid="ref22">22</xref>] and secured the second place in both subtasks. 
In the rest of the paper, for simplicity, we refer to the 2024 ChemoTimelines shared task as the shared task.</p><p>In this paper, we further researched SACT timeline extraction using the shared task dataset and adding the dataset of another frequent type of cancer (such as CRC) from another academic medical center. We explored task-specific finetuning approaches and LLM prompting [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref29">29</xref>] to extract SACT timelines. We compared our results on the breast, ovarian, and melanoma datasets from the shared task to the results of the shared task participants. We achieved a new SOTA in Subtask1. We established the SOTA for the CRC dataset as this is a new dataset. Our LLM-based system investigations add to the research of using LLMs for end-to-end SACT treatment timeline extraction from clinical narratives, as only one team explored end-to-end timeline extraction using LLMs in the shared task.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Summary of the 2024 ChemoTimelines shared task. TIMEX3: time expressions; CONTAINS-1: reverse of CONTAINS, meaning &#x201C;chemotherapy&#x201D; is contained by &#x201C;last Thursday&#x201D;; DocTime: document creation time.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e67801_fig01.png"/></fig><p>The contributions of this paper are as follows.</p><p>First, the approaches for patient-level timeline extraction through the task of SACT timeline extraction. We perform experiments on the 2024 ChemoTimelines shared task as well as on the THYME CRC patients. 
Our results and analysis on this task add to the knowledge of extracting treatment timelines from EMRs using LLM-based methods.</p><p>Second, the SOTA performance of our finetuned LM-based system for Subtask1 of the 2024 ChemoTimelines shared task.</p><p>Third, the SOTA performance achieved with LLM prompting approaches for Subtask1 and Subtask2 of the 2024 ChemoTimelines shared task, which outperformed the shared task participant systems that took the approach of prompting LLMs.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>All electronic health record (EHR) data used in this study are deidentified in accordance with the datasets&#x2019; relevant privacy regulations [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. We strictly adhered to the terms outlined in the data use agreement, ensuring that no data were transmitted to any external or public APIs. Ethics approval was not required because the study used secondary data that was aggregated and anonymized before analysis. All experiments were conducted on a secure local machine operating behind a firewall, maintaining full data confidentiality and integrity throughout the study.</p></sec><sec id="s2-2"><title>Tasks and Datasets</title><p>The first dataset we used was from the shared task [<xref ref-type="bibr" rid="ref13">13</xref>]. The EMR notes of 149 patients with breast cancer, ovarian cancer, and melanoma from the University of Pittsburgh Medical Center were expert-annotated by the shared task organizers for instance-level pairwise temporal relations following the THYME2 schema [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref14">14</xref>] and SACT patient-level timelines.</p><p>The second dataset we used included the THYME patients&#x2014;199 CRC patients from Mayo Clinic. This dataset was NOT part of the 2024 ChemoTimelines shared task. 
Note that the original THYME corpus consisted of one radiology, one pathology, and one oncology note for each of the 199 CRC patients&#x2014;not sufficient to extract SACT timelines. Therefore, for the work described in this paper, we obtained the entire EMR documentation for these 199 CRC patients (all manually deidentified except for dates). As with the shared task patients, the CRC patient EMRs were represented by a wide variety of document types. Following the shared task protocol, the CRC notes were expert-annotated for instance-level pairwise temporal relations following the THYME2 schema and SACT patient-level timelines. <xref ref-type="table" rid="table1">Table 1</xref> shows the dataset distributions. Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides the pairwise label distributions. The label set for the pairwise relations is CONTAINS, BEGINS-ON, ENDS-ON, OVERLAP, and BEFORE. In the final SACT timeline, we converted CONTAINS to CONTAINS-1 so that all triples are structured as &#x003C;EVENT, TLINK, TIMEX3&#x003E;, where CONTAINS-1 semantically indicates that the drug was administered on the date specified by the temporal expression (TIMEX3). Note that we did not use i2b2 2012 because we focused on cancer treatment timeline extraction only in this work. <xref ref-type="other" rid="box1">Textbox 1</xref> presents a concrete example of patient-level SACT timelines.</p><p>As is the established convention, in this paper, we refer to the labels in the shared task and THYME datasets as &#x201C;gold.&#x201D; All datasets come with predefined training (train), development (dev), and test splits that we used accordingly. 
Note that the gold labels of the shared task test set were not publicly available; however, participants could submit their system predictions to the shared task organizers to get evaluation results, thus providing independent evaluation over a held-out, eyes-off dataset.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Dataset summary.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Splits</td><td align="left" valign="top">Patients</td><td align="left" valign="top">Notes</td><td align="left" valign="top">Words<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">EVENT mentions</td><td align="left" valign="top">TIMEX3<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> mentions</td><td align="left" valign="top">TLINKs<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7">Ovarian cancer (from 2024 ChemoTimelines shared task)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train</td><td align="left" valign="top">26</td><td align="left" valign="top">1675</td><td align="left" valign="top">1,183,632</td><td align="left" valign="top">1168</td><td align="left" valign="top">597</td><td align="left" valign="top">494</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Dev<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">8</td><td align="left" valign="top">562</td><td align="left" valign="top">308,814</td><td align="left" valign="top">790</td><td align="left" valign="top">312</td><td align="left" valign="top">226</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test</td><td align="left" 
valign="top">8</td><td align="left" valign="top">559</td><td align="left" valign="top">257,116</td><td align="left" valign="top">664</td><td align="left" valign="top">381</td><td align="left" valign="top">Not released<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup></td></tr><tr><td align="left" valign="top" colspan="7">Breast cancer (from 2024 ChemoTimelines shared task)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train</td><td align="left" valign="top">33</td><td align="left" valign="top">1002</td><td align="left" valign="top">465,644</td><td align="left" valign="top">1023</td><td align="left" valign="top">576</td><td align="left" valign="top">455</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Dev</td><td align="left" valign="top">16</td><td align="left" valign="top">499</td><td align="left" valign="top">225,588</td><td align="left" valign="top">279</td><td align="left" valign="top">146</td><td align="left" valign="top">113</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test</td><td align="left" valign="top">35</td><td align="left" valign="top">1333</td><td align="left" valign="top">786,896</td><td align="left" valign="top">2560</td><td align="left" valign="top">1118</td><td align="left" valign="top">Not released</td></tr><tr><td align="left" valign="top" colspan="7">Melanoma (from 2024 ChemoTimelines shared task)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train</td><td align="left" valign="top">10</td><td align="left" valign="top">233</td><td align="left" valign="top">124,924</td><td align="left" valign="top">147</td><td align="left" valign="top">78</td><td align="left" valign="top">48</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Dev</td><td align="left" valign="top">3</td><td align="left" valign="top">211</td><td align="left" valign="top">178,308</td><td align="left" valign="top">789</td><td align="left" valign="top">261</td><td align="left" valign="top">201</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test</td><td align="left" valign="top">10</td><td align="left" valign="top">229</td><td align="left" valign="top">156,083</td><td align="left" valign="top">398</td><td align="left" valign="top">193</td><td align="left" valign="top">Not released</td></tr><tr><td align="left" valign="top" colspan="7">Colorectal cancer (CRC)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Train</td><td align="left" valign="top">98</td><td align="left" valign="top">12,990</td><td align="left" valign="top">6,038,431</td><td align="left" valign="top">11,161</td><td align="left" valign="top">6155</td><td align="left" valign="top">5897</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Dev</td><td align="left" valign="top">50</td><td align="left" valign="top">6810</td><td align="left" valign="top">3,105,675</td><td align="left" valign="top">3964</td><td align="left" valign="top">2194</td><td align="left" valign="top">1924</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test</td><td align="left" valign="top">51</td><td align="left" valign="top">7357</td><td align="left" valign="top">3,587,387</td><td align="left" valign="top">7552</td><td align="left" valign="top">3612</td><td align="left" valign="top">4403</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>&#x201C;Words&#x201D; 
denotes tokens delimited by white spaces.</p></fn><fn id="table1fn2"><p><sup>b</sup>TIMEX3: time expressions.</p></fn><fn id="table1fn3"><p><sup>c</sup>TLINKs: pairwise temporal relations.</p></fn><fn id="table1fn4"><p><sup>d</sup>Dev: development set.</p></fn><fn id="table1fn5"><p><sup>e</sup>Note that the number of test set TLINKs for the 2024 ChemoTimelines shared task was not released publicly.</p></fn></table-wrap-foot></table-wrap><boxed-text id="box1"><title> An example of a summarized patient-level SACT timeline extracted from the entire patient&#x2019;s EMR chart.</title><list list-type="bullet"><list-item><p>['chemotherapy', 'contains-1', '2013-06-20']</p></list-item><list-item><p>['carboplatin', 'contains-1', '2013-10-24']</p></list-item><list-item><p>['carboplatin', 'contains-1', '2013-09-19']</p></list-item><list-item><p>['carboplatin', 'contains-1', '2013-07-18']</p></list-item><list-item><p>['carboplatin', 'contains-1', '2013-08-08']</p></list-item><list-item><p>['carboplatin', 'contains-1', '2013-08-29']</p></list-item><list-item><p>['taxol', 'contains-1', '2013-10-24']</p></list-item><list-item><p>['taxol', 'contains-1', '2013-09-19']</p></list-item><list-item><p>['taxol', 'contains-1', '2013-07-18']</p></list-item><list-item><p>['taxol', 'contains-1', '2013-08-08']</p></list-item><list-item><p>['taxol', 'contains-1', '2013-08-29']</p></list-item></list></boxed-text></sec><sec id="s2-3"><title>Approaches</title><p>We explored 2 approaches for the task of SACT timelines extraction: (1) finetuning smaller LMs and (2) prompting LLMs. <xref ref-type="fig" rid="figure2">Figure 2</xref> shows the complete pipeline of both approaches. We describe each approach in detail in this section.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Methods summary. On the left-hand side, temporal relations are classified via a small finetuned language model (FT LM). 
On the right-hand side, temporal relation triplets are extracted by prompting large language models (LLMs). In both approaches, EVENTS are extracted using a <bold>B</bold>egin-<bold>I</bold>nside-<bold>O</bold>utside (BIO) tagger. Output for both systems is the same, see Textbox 1. cTAKES: Apache Clinical Text Analysis and Knowledge Extraction System; TIMEX3: time expressions; TLINK: pairwise temporal relation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e67801_fig02.png"/></fig><sec id="s2-3-1"><title>Approach 1: Finetuning LMs for Temporal Relation Extraction</title><sec id="s2-3-1-1"><title>Overview</title><p>In this approach, we cast the task of SACT timeline extraction as a pairwise temporal relation extraction task followed by a temporal relation summarization step. Given input texts, we designed a pipeline with the following steps: (1) extracting SACT EVENT mentions, (2) extracting TIMEX3 mentions, (3) classifying pairwise EVENT-TIMEX3 temporal relations, (4) normalizing TIMEX3 mentions, and (5) summarizing and refining patient-level timelines.</p></sec><sec id="s2-3-1-2"><title>Extracting SACT EVENT Mentions</title><p>We trained a sequence labeling tagger that marks the beginning, inside, and outside (BIO) of a SACT treatment EVENT mention in the text. The tagger was trained on the train split of the gold labeled data by finetuning a pretrained LM [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. The &#x201C;Experimental Settings&#x201D; section shows more details.</p></sec><sec id="s2-3-1-3"><title>Extracting TIMEX3 Mentions</title><p>TIMEX3 mentions were extracted by the temporal module of the Apache Clinical Text Analysis and Knowledge Extraction System (cTAKES) [<xref ref-type="bibr" rid="ref31">31</xref>], a publicly available text processing system. 
The precision, recall, and <italic>F</italic><sub>1</sub>-scores of cTAKES for TIMEX3 mention extraction are 57.17%, 83.95%, and 67.25%, respectively; evaluated on the original THYME dataset described in the &#x201C;Tasks and Datasets&#x201D; subsection. Different methodologies were used for SACT EVENT mention extraction and TIMEX3 mention extraction because there was no publicly available SACT EVENT extractor with solid performance at the time of the experiments.</p></sec><sec id="s2-3-1-4"><title>Classifying Pairwise EVENT-TIMEX3 Temporal Relations</title><p>Given an EVENT-TIMEX3 pair, the task is to determine the temporal relation between them according to a predefined label set of TLINKs (described in the &#x201C;Introduction&#x201D; and &#x201C;Tasks and Datasets&#x201D; sections). For example, if the patient started a regimen of Taxol on August 1, 2012, the relation between &#x201C;Taxol&#x201D; and &#x201C;August 1, 2012&#x201D; is BEGINS-ON. Inspired by previous works [<xref ref-type="bibr" rid="ref11">11</xref>], we finetuned EntityBERT for this step to create an LM specifically trained to attend to EVENT and TIMEX3 mentions. The input to the model was the EVENT and TIMEX3 mentions within a context window with the EVENT and TIMEX3 mentions highlighted by special tokens, possibly crossing sentence boundaries. We followed the same data preprocessing format as described in [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Concretely, EVENT and TIMEX mentions are highlighted by XML tags &#x201C;&#x003C;e&#x003E;,&#x201D; &#x201C;&#x003C;/e&#x003E;,&#x201D; &#x201C;&#x003C;t&#x003E;,&#x201D; and &#x201C;&#x003C;/t&#x003E;.&#x201D; The context window that defines the token distances between an EVENT and TIMEX3 in an EVENT-TIMEX3 pair is set to 60 tokens, empirically derived to cover over 95% of the EVENT-TIMEX3 pair instances. 
The model was trained on the train split of the gold-labeled data for multiclass classification.</p></sec><sec id="s2-3-1-5"><title>Normalizing TIMEX3 Mentions</title><p>The goal of this step is to map TIMEX3 mentions to a computable format. We used TimeNorm [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>] to normalize the TIMEX3 mentions and the document creation time (DocTime) to ISO-TimeML standard [<xref ref-type="bibr" rid="ref34">34</xref>] (eg, &#x201C;yesterday&#x201D; in a note with a DocTime of &#x201C;2022-04-29&#x201D; would be normalized to &#x201C;2022-04-28&#x201D;).</p></sec><sec id="s2-3-1-6"><title>Summarizing and Refining Patient-Level Timelines</title><p>A patient&#x2019;s SACT history can be mentioned in multiple notes in different contexts. For example, the physician may discuss the termination of one treatment due to side effects; despite that, in another note, they may say that the therapy will be given to the patient for 3 more cycles. Therefore, after the instance-level temporal relation extraction, deduplication and conflict resolution are necessary to get the final patient-level SACT timelines. For this step, we followed the heuristics from the shared task [<xref ref-type="bibr" rid="ref13">13</xref>].</p></sec></sec></sec><sec id="s2-4"><title>Approach 2: Prompting LLMs for SACT Timeline Extraction</title><sec id="s2-4-1"><title>Overview</title><p>We developed an end-to-end timeline extraction pipeline via LLM prompting. This pipeline involved two steps: Step 1 focused on extracting &#x003C;EVENT, TLINK, TIMEX3&#x003E; triplets from clinical texts, and Step 2 was designed for TIMEX3 normalization. We took the approach of in-context learning, which refers to the method of adding exemplars of gold examples with answers to the prompt [<xref ref-type="bibr" rid="ref25">25</xref>], a common practice in prompt engineering. 
<xref ref-type="other" rid="box2">Textbox 2</xref> provides the prompt templates we used in both steps. For Step 1, we provide 4 exemplars for each TLINK label. For Step 2, we provide 5 exemplars in total. The exemplars are selected from the training split of the data. We explored the discrete prompting strategy where the prompts are created manually, ultimately settling on the prompts with the best performance.</p><boxed-text id="box2"><title> Prompt templates used in our large language model (LLM) experiments. For Step 1, we provide 4 exemplars for each label. For Step 2, we provide 5 exemplars in total.</title><list list-type="bullet"><list-item><p>Step 1 prompt: You are a helpful assistant for oncologists. You will read the given PATIENT EHR and summarize the patient's chemotherapy treatment TIMELINES. Please only output TIMELINES in the requested format. Please do not include any other text or reasoning, do not include timelines for any other treatments besides chemotherapy. Please do not use any labels other than the ones given in the examples, i.e., BEGINS-ON, ENDS-ON, CONTAINS. Here are some examples.</p></list-item><list-item><p>Step 2 prompt: You are asked to decide the date of a time expression. If today was 2013-05-02, what would the date of yesterday be? Please only output the date in the format of &#x201C;YYYY-MM-DD&#x201D;. Answer &#x201C;Unknown&#x201D; if you don't know. Here are some examples.</p></list-item></list></boxed-text></sec><sec id="s2-4-2"><title>Step 1: Extracting &#x003C; EVENT, TLINK, TIMEX3&#x003E; Triplets</title><p>The construction of patient-level treatment timelines requires the system to process all notes of a patient, thus the input can exceed the LLM context window. 
Current open LLMs have a limited number of tokens they can process at a time, for example, LLaMA1 [<xref ref-type="bibr" rid="ref35">35</xref>] supports up to 2048 tokens and LLaMA2 [<xref ref-type="bibr" rid="ref23">23</xref>] supports up to 4096 tokens; however, even if the LLM could ingest all the notes of one patient as input at one time, it would not be an efficient way of processing texts as transformers&#x2019; self-attention scales quadratically with input length. Therefore, sending all the notes of a patient to LLMs at one time is not practical. To make this task more feasible for LLMs, we prompted the LLM with only relevant snippets of notes and assembled the timelines afterwards. Specifically, we extracted SACT EVENT mentions using the BIO tagger trained in Approach 1, then fed the LLM the sentences containing the SACT EVENT mentions to extract the triplets. Note, the input to the LLM was a sentence, unlike the context window instances fed to the pairwise classifier in Approach 1. In our initial experiments, we used context window instances with the LLMs as well; however, the partial sentences confused them as tokens outside of the window are discarded. To give LLMs a self-contained input with a reasonable sequence length, we decided to give a complete sentence as input for LLMs instead of a context window as we did in Approach 1.</p></sec><sec id="s2-4-3"><title>Step 2: TIMEX3 Normalization With LLMs</title><p>We applied in-context learning to normalize the TIMEX3 mentions. For each output triplet from Step 1, we prompted the model to normalize the date of the TIMEX3 mention given the DocTime of the note. We then assembled the final timelines, using the same heuristics as in Approach 1.</p></sec></sec><sec id="s2-5"><title>Experimental Settings</title><p>We explored two approaches for the task of SACT timelines extraction: (1) finetuning smaller LMs and (2) prompting LLMs. 
For the first approach, we finetuned PubMedBERT base model [<xref ref-type="bibr" rid="ref22">22</xref>] to train the SACT event tagger. For the temporal relation classification task, we finetuned EntityBERT based on the results reported by Lin et al [<xref ref-type="bibr" rid="ref11">11</xref>], where they finetuned BioBERT, PubMedBERT, and EntityBERT for clinical temporal relation classification and found that EntityBERT outperformed the other two models. For the experiments with LLMs, we chose LLaMA2-70B [<xref ref-type="bibr" rid="ref23">23</xref>], LLaMA3.1-70B [<xref ref-type="bibr" rid="ref36">36</xref>], and Mixtral-8&#x00D7;7B-Instruct-v1 [<xref ref-type="bibr" rid="ref24">24</xref>], which are current SOTA open LLMs. We did not use proprietary LLMs such as GPT4 [<xref ref-type="bibr" rid="ref26">26</xref>] because we did not have access to their Health Insurance Portability and Accountability Act (HIPAA)-compliant versions. The open models we experimented with are reported to have yielded results competitive with those of the proprietary models [<xref ref-type="bibr" rid="ref24">24</xref>]. Furthermore, we compare our results with those systems in the shared task for the types of cancers included in the shared task. For the CRC dataset (not included in the shared task), we establish the first result that will serve as the baseline for the community. See Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for details on the computational settings.</p><p>We experimented with prompting LLMs for both Subtask1 and Subtask2. In Subtask1, we provided explicit gold SACT events and time expressions in the text, then prompted the LLM to predict the temporal relation between them. The prompt template for this subtask is shown in Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
In Subtask2, we passed to the LLM only plain text as input, then asked the LLM to extract the SACT events, time expressions, and temporal relation between them in 1 step. <xref ref-type="other" rid="box2">Textbox 2</xref> lists the prompt template for Subtask2.</p></sec><sec id="s2-6"><title>Evaluation and Baseline</title><p>We used the evaluation metric provided by the shared task, which computed the average <italic>F</italic><sub>1</sub>-scores across all patients. There were 4 settings with different temporal granularities: strict, relaxed-to-day, relaxed-to-month, and relaxed-to-year. For example, the relaxed-to-month setting required the model to correctly predict the year and month when the therapy was given, while the strict setting required the model to capture the exact date when the patient received the therapy. The official metric for the 2024 shared task was relaxed-to-month scores, which we used as our metric to report the main results in this paper. Results using other metrics are given in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>As a baseline, we used the baseline system used in the shared task, which implemented a predefined dictionary as a lookup table for SACT EVENT extraction and a finetuned LM for temporal relation classification. We also compared our results on the 3 types of cancer (breast cancer, ovarian cancer, and melanoma) to the shared task leaderboard results.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>In <xref ref-type="table" rid="table2">Table 2</xref>, we present our results on the development (Dev) and test sets. As the CRC dataset was not available for the shared task, we also present the results of our model finetuned only on the shared task data (under EntityBERT 3 Cr) for a direct comparison with other participating systems. That is, using Approach 1 described above, we trained 2 versions of the model. 
&#x201C;EntityBERT&#x201D; was trained on the shared task data and CRC data. &#x201C;EntityBERT 3 Cr&#x201D; was trained only on the shared task data (we combined the training datasets of multiple cancer types into 1 training dataset to train the EntityBERT 3 Cr model and EntityBERT model). Subtask1 in <xref ref-type="table" rid="table2">Table 2</xref> shows the results with gold SACT EVENT and TIMEX3 mentions as input. In general, the finetuned EntityBERT and EntityBERT (3 Cr) outperformed LLaMA2, LLaMA3.1, and Mixtral LLMs by a large margin. Among the LLMs, LLaMA achieved higher scores than Mixtral. In <xref ref-type="table" rid="table2">Table 2</xref>, Subtask2 shows the end-to-end evaluation results. The SACT event extraction evaluation results using the BIO tagger can be found in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. We note a wide gap between the performance with gold mention input (Subtask1) and the performance with automatically extracted mentions (Subtask2), suggesting that the errors in the mention extraction stage propagate to the relation extraction stage and dramatically affect the overall accuracy of the system. We also notice that the smaller finetuned models outperform LLMs in most cases except for melanoma, the reasons for which we discuss in the Discussion section.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Evaluation results of our systems across 4 types of cancers from 2 academic centers. 
Scores are macro <italic>F</italic><sub>1</sub>-score, relaxed-to-month.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Cancer type and models</td><td align="left" valign="bottom" colspan="2">Subtask1<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, %</td><td align="left" valign="bottom" colspan="2">Subtask2<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>, %</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Development set</td><td align="left" valign="bottom">Test set</td><td align="left" valign="bottom">Development set</td><td align="left" valign="bottom">Test set</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Ovarian cancer</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" rowspan="5"/><td align="left" valign="top">EntityBERT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">93<sup>e</sup></td><td align="left" valign="top">95<sup>e</sup></td><td align="left" valign="top">64</td><td align="left" valign="top">61</td></tr><tr><td align="left" valign="top">EntityBERT (3 Cr)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup>,<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">93<sup>e</sup></td><td align="left" valign="top">94</td><td align="left" valign="top">67<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">69<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td></tr><tr><td align="left" valign="top">LLaMA2<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top">70</td><td align="left" valign="top">70</td><td align="left" valign="top">29</td><td align="left" 
valign="top">42</td></tr><tr><td align="left" valign="top">LLaMA3.1<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="left" valign="top">75</td><td align="left" valign="top">74</td><td align="left" valign="top">31</td><td align="left" valign="top">56</td></tr><tr><td align="left" valign="top">Mixtral<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup></td><td align="left" valign="top">60</td><td align="left" valign="top">67</td><td align="left" valign="top">7</td><td align="left" valign="top">27</td></tr><tr><td align="left" valign="top" colspan="2">Breast cancer</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" rowspan="5"/><td align="left" valign="top">EntityBERT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">97<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">97</td><td align="left" valign="top">88<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">63</td></tr><tr><td align="left" valign="top">EntityBERT (3 Cr)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">97<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">98<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">87</td><td align="left" valign="top">66<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td></tr><tr><td align="left" valign="top">LLaMA2</td><td align="left" valign="top">81</td><td align="left" valign="top">83</td><td align="left" valign="top">61</td><td align="left" valign="top">50</td></tr><tr><td align="left" valign="top">LLaMA3.1</td><td align="left" valign="top">79</td><td align="left" valign="top">70</td><td align="left" valign="top">66</td><td align="left" 
valign="top">48</td></tr><tr><td align="left" valign="top">Mixtral</td><td align="left" valign="top">66</td><td align="left" valign="top">63</td><td align="left" valign="top">37</td><td align="left" valign="top">25</td></tr><tr><td align="left" valign="top" colspan="2">Melanoma</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" rowspan="5"/><td align="left" valign="top">EntityBERT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">86<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">91<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">43</td><td align="left" valign="top">39</td></tr><tr><td align="left" valign="top">EntityBERT (3 Cr)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">86<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">88</td><td align="left" valign="top">46</td><td align="left" valign="top">40</td></tr><tr><td align="left" valign="top">LLaMA2</td><td align="left" valign="top">80</td><td align="left" valign="top">79</td><td align="left" valign="top">47<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">47<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td></tr><tr><td align="left" valign="top">LLaMA3.1</td><td align="left" valign="top">67</td><td align="left" valign="top">71</td><td align="left" valign="top">26</td><td align="left" valign="top">38</td></tr><tr><td align="left" valign="top">Mixtral</td><td align="left" valign="top">65</td><td align="left" valign="top">65</td><td align="left" valign="top">4</td><td align="left" valign="top">25</td></tr><tr><td align="left" valign="top" colspan="2">Colorectal cancer (CRC)</td><td align="left" valign="top"/><td 
align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" rowspan="4"/><td align="left" valign="top">EntityBERT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">90<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">83<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">58<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">56<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td></tr><tr><td align="left" valign="top">LLaMA2</td><td align="left" valign="top">66</td><td align="left" valign="top">77</td><td align="left" valign="top">40</td><td align="left" valign="top">32</td></tr><tr><td align="left" valign="top">LLaMA3.1</td><td align="left" valign="top">66</td><td align="left" valign="top">68</td><td align="left" valign="top">45</td><td align="left" valign="top">38</td></tr><tr><td align="left" valign="top">Mixtral</td><td align="left" valign="top">58</td><td align="left" valign="top">66</td><td align="left" valign="top">19</td><td align="left" valign="top">15</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Subtask1: input is gold entities (systemic anticancer therapy [SACT] events and time expressions).</p></fn><fn id="table2fn2"><p><sup>b</sup>Subtask2: entities are automatically generated by the system.</p></fn><fn id="table2fn3"><p><sup>c</sup>These are systems using small finetuned models.</p></fn><fn id="table2fn4"><p><sup>d</sup>EntityBERT (3 Cr): EntityBERT model trained only on the shared task data.</p></fn><fn id="table2fn5"><p><sup>e</sup>These are the best results.</p></fn><fn id="table2fn6"><p><sup>f</sup>LLaMA2-70B.</p></fn><fn id="table2fn7"><p><sup>g</sup>LLaMA3.1-70B.</p></fn><fn 
id="table2fn8"><p><sup>h</sup>Mixtral-8&#x00D7;7B-Instruct-v1.</p></fn></table-wrap-foot></table-wrap><p>Furthermore, unlike the LLM prompting approaches, both our systems based on the smaller finetuned models can be deployed for inference on a laptop without a GPU. Our Subtask1 system is able to process approximately 14 notes/minute. Our Subtask2 system is able to process approximately 10 notes/minute. Assuming a typical patient with 200 notes, our Subtask1 system takes on average 14.5 minutes to process all of the patient&#x2019;s notes, and our Subtask2 system takes on average 20 minutes to process all of the patient&#x2019;s notes. On the other hand, the LLM prompting experiments were conducted on NVIDIA A100 GPUs. It took the LLaMA3.1 70B model approximately 28 minutes for Subtask1 and 13 minutes for Subtask2 to process 200 notes. It took LLMs less time to complete Subtask2 because only sentences containing TIMEX3 mentions needed to be processed in Subtask2.</p><p>We position our systems within the broader context of the 2024 ChemoTimelines shared task by comparing them with the shared task participants&#x2019; systems. If 1 shared task participant has multiple submissions, we take their best result for comparison. Note the official metric for the leader board is relaxed-to-month scores on the Test set. We first compare the result of our EntityBERT (3 Cr) model with the results of the participating systems using similar approaches, that is, finetuning smaller LMs [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. In <xref ref-type="fig" rid="figure3">Figure 3</xref>-Part A, we can see that in Subtask1 our model achieved the best results overall and on the individual cancer types. 
Our Subtask1 result was 3 points higher than the best shared task score achieved by LAILab [<xref ref-type="bibr" rid="ref15">15</xref>] (93% vs 90%). In Subtask2 (<xref ref-type="fig" rid="figure3">Figure 3</xref>-Part B), our system had the second-best overall scores. However, it is worth noting that LAILab finetuned Flan-T5-XXL [<xref ref-type="bibr" rid="ref19">19</xref>], a model with 11B parameters, which was much bigger than the EntityBERT model we used that had about 100 million parameters.</p><p>Finally, we observe in <xref ref-type="table" rid="table2">Table 2</xref> that the model trained only on the breast, ovarian, and melanoma data from the train split of the shared task (ie, EntityBERT 3 Cr) performed slightly better than its counterpart trained on the full train split containing all 4 types of cancer (ie, EntityBERT) in Subtask2. We conjecture that since there was more data for CRC than the other types of cancer within our dataset, the representation of the signal from the CRC data overwhelmed that of the other three cancer types inside the model. The addition of the second dataset (CRC) in this work aims to create a larger pool of datapoints adding a new type of cancer and a different institution as the data source. It also helps answer the questions of whether (1) a model built off data across different EMR sources might be feasible and (2) the quantity of the data matters. Our experiments on these two datasets show that (1) it is likely that institution-specific models capture treatment patterns better but not by a large margin and (2) patterns of the data-rich source likely dominate.</p><p>In <xref ref-type="fig" rid="figure4">Figure 4</xref> we compare our LLM-based approaches with the shared task systems that prompted LLMs. With gold mentions as input (Subtask1), our system based on prompting LLaMA2 achieved the highest overall score compared to the shared task systems. 
When using Mixtral as the starting point, our system and the NLPeers [<xref ref-type="bibr" rid="ref18">18</xref>] system achieved similar overall scores (65% vs 64%), which are significantly lower than the overall score of LLaMA2 and LLaMA3.1, suggesting that LLaMA family models are more suitable for this subtask than Mixtral. Only 1 team from the shared task explored end-to-end timeline construction using an LLM. In <xref ref-type="fig" rid="figure4">Figure 4</xref>-Part B, Subtask2 we can see that the overall performance of the two Mixtral-based systems is similar. Again, we see a performance discrepancy between LLaMA and Mixtral. Jiang et al [<xref ref-type="bibr" rid="ref24">24</xref>] show that Mixtral performed better than or comparable to LLaMA2 across multiple benchmarks. Our results suggest that the decision of choosing the right LLM should be made empirically. Note that the two LLaMA models we used have the same number of parameters, 70B. Compared to LLaMA2, LLaMA3.1 improved the results on the ovarian dataset, but fell short on the breast and melanoma datasets. Across 64 evaluation settings (4 cancer types, 4 metrics, 2 subtasks, both development and test sets), LLaMA3.1 achieved higher or same <italic>F</italic><sub>1</sub>-scores as LLaMA2 in 39 cases (61%; see Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Overall, we observe similar trends across strict, relaxed-to-day, relaxed-to-year evaluation settings as relaxed-to-month setting.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Comparison to finetuning-based models in the shared task [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. Scores are relaxed-to-month macro <italic>F</italic><sub>1</sub>-score on the test set. 
&#x201C;Our EntityBERT, 3 cr&#x201D; refers to the EntityBERT model trained only on the shared task data. The best-performing team in the shared task was LAILab [15].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e67801_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Comparison to LLM prompting systems in the shared task [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Scores are relaxed-to-month macro <italic>F</italic><sub>1</sub>-scores on test set. &#x201C;Our LLaMA2&#x201D; and &#x201C;Our LLaMA3.1&#x201D; are LLaMA2-70B and LLaMA3.1-70B, respectively. &#x201C;Our Mixtral&#x201D; is the Mixtral-8 &#x00D7; 7B-Instruct-v1 model. FS and ZS refer to few-shot and zero-shot settings.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="bioinform_v6i1e67801_fig04.png"/></fig><p>We performed error analysis on the relaxed-to-month output for each cancer type cohort. An incorrect prediction within a predicted patient timeline against a gold patient timeline is either a false positive, that is, a predicted triplet that is not present in the gold timeline, or a false negative, that is, a triplet in the gold timeline, which is not in the predicted timeline. There is also the possibility of an apparent false positive or false negative being actually correct due to an annotation error, for which we also review. We analyze which of the components in the system pipeline or the annotation process is the root cause of an error in the predicted or gold timelines. For the predicted timeline, this can consist of any combination of one of the extraction components for SACT EVENT mentions (SACT Detection Error) and temporal expression mention (TIMEX3 Detection Error), the TLINK classifier (TLINK Error), and summarization error (Total incorrect summarized predictions). 
For the gold timeline, this can only consist of an annotation error.</p><p>We present the breakdown per error type from the test set in Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. We randomly sampled each type of false positive errors to collect a sample size using a 95% CI, a margin of error of 5%, and a population proportion of 50%. We analyzed the instance-level false positives since each was associated with a specific TLINK classification instance. The incorrect unsummarized predictions are inputs to the summarization algorithm which result in the incorrect summarized predictions. We found that most of the errors came from incorrect TLINK classification, followed by annotation errors, and finally detection of SACT EVENT and TIMEX3 mentions. We identified the annotation errors for the most part as resulting from likely missed screening of some notes by the expert annotators, as this is a highly cognitively demanding task for a human to perform (see <xref ref-type="table" rid="table3">Table 3</xref> for examples). The false negatives tended to be the result of formatting issues, complex reasoning, and some level of hedging around the event. We found that in many notes, there are subsections that start with dates, which are used as the headings for these subsections (see examples in &#x201C;False negative: formatting&#x201D; in <xref ref-type="table" rid="table3">Table 3</xref>); then all events described in that subsection are related to these dates. This is especially challenging as the subsections could consist of multiple sentences.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Types of errors and examples. 
Note that the dates have been intentionally altered for the purpose of this paper.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Type of error</td><td align="left" valign="bottom">Text</td><td align="left" valign="bottom">Explanation</td></tr></thead><tbody><tr><td align="left" valign="top">Annotation error</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Anastrozole (Arimidex) 1 mg once a day by mouth [Order Comment : can take am].</p></list-item><list-item><p>Last dose : 10/18/2033.</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>No gold TLINK<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> for &#x201C;anastrozole (Arimidex)&#x201D; and &#x201C;10/18/2033&#x201D;.</p></list-item></list></td></tr><tr><td align="left" valign="top">Annotation error</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Dr Person17, later today, to discuss management from the standpoint of chemotherapy or hormonal.</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>No gold TLINK for &#x201C;later today&#x201D; and &#x201C;chemotherapy&#x201D;.</p></list-item></list></td></tr><tr><td align="left" valign="top">Annotation error</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Chemo and radiation in 2055.</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>No gold link for &#x201C;chemo&#x201D; and &#x201C;2055&#x201D;.</p></list-item></list></td></tr><tr><td align="left" valign="top">False negative: formatting</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>July through December 2055: Completed his 12 cycles of FOLFOX. 
The first 8 cycles included oxaliplatin and the last 4 cycles were 5-FU/leucovorin.</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>No prediction TLINK for &#x201C;December 2055&#x201D; and &#x201C;5-FU/leucovorin&#x201D;.</p></list-item><list-item><p>The dates are used as subsection headings with all events related to them.</p></list-item></list></td></tr><tr><td align="left" valign="top">False negative:<break/>complex reasoning</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>November 2055, CEA begins to increase. There is abnormal uptake on a PET scan near the rectosigmoid junction. Patient is then initiated on XELIRI/Avastin in February 2055. [more text<italic>.</italic>.].</p></list-item><list-item><p>May 2055 through August 2055, managed with observation alone off of all chemotherapeutic administration.</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>No prediction TLINK to indicate that XELIRI/Avastin was discontinued May 2055 through August 2055.</p></list-item></list></td></tr><tr><td align="left" valign="top">False negative:<break/>hedging</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>We had attempted to treat him with ipilimumab last week; however, when he got the bathroom in the office, he tripped over a wheel of one of the beds and had a fall.</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Gold TLINK is (last week, CONTAINS, ipilimumab). 
No predicted TLINK due to the expressed uncertainty of whether the event happened.</p></list-item></list></td></tr><tr><td align="left" valign="top">False positive:<break/>complex reasoning</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>&#x2026;cycles of Cytoxan, fludarabine, and Rituxan chemotherapy through July 2055.</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Predicted TLINKs are correct. However, the treatment is associated with the patient&#x2019;s leukemia, not the melanoma which was the targeted extraction.</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>TLINK: pairwise temporal relations.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The implications of the automatic and faithful extraction of treatment timelines from patients&#x2019; EMRs affect the spectrum of patient-physician interactions, decision-making processes, and advances in cancer research. At the point of care, a clinician presented with the patient&#x2019;s treatment timeline would be able to quickly gain insights into the complex disease and treatment process for that patient, especially helpful in oncology where patients come to specialized centers with hundreds of notes. For research, the automatic generation of timelines opens the door to creating large-scale cohorts to answer important research questions. One such question is related to the treatment regimens as key details in understanding the effects of genetic, epigenetic, and other factors on tumor behavior and responsiveness. 
As precision oncology progresses, insights into the fine interplay of treatment with tumor molecular characteristics and patient phenotypes become even more critical not only as a source of research data, but as a means of translating findings into patient-tailored therapies similar to those that have been applied to breast cancer and melanoma [<xref ref-type="bibr" rid="ref38">38</xref>].</p><p>Although there is a lot of excitement around LLMs and prompt engineering, there is a major constraint that needs to be factored into engineering decisions&#x2014;that of the length of the input text. This is especially pronounced for tasks where the entire patient EMR narrative needs to be considered, for example, treatment timeline extraction. When considering the input prompt for LLMs, we first considered sending 1 note at a time to LLMs, or concatenating all the sentences that contain SACT EVENT mentions in a note and sending them to LLMs. However, our experiments showed that extracting timelines from long sequences (even just one patient note) was too challenging for the LLMs we evaluated (although these were the SOTA open LLMs). For example, on the ovarian cancer development set, we saw a 10-point drop in relaxed-to-month scores when we sent multiple sentences from the same document to LLaMA2.</p><p>As the error analysis pointed out, the main source of the error is TLINK classification, that is the assignment of the correct temporal relation between an EVENT and TIMEX. The technology we experimented with is LM-based&#x2014;finetuning smaller LMs and LLM prompting. A path of research to improve TLINK extraction lies in combining the outputs of various technologies into an ensemble with a voting mechanism, for example, majority vote or a classification layer. The ensemble could potentially include the output of LLM-based and non&#x2013;LLM-based methods such as classic support vector machines [<xref ref-type="bibr" rid="ref39">39</xref>]. 
Another potential solution might lie in exploring a 2-stage LLM finetuning strategy, which is a refined ensemble method [<xref ref-type="bibr" rid="ref40">40</xref>]. The first stage decreases bias and variance iteratively, while in the second stage, a selected fixed-bias model is used to further reduce variance due to optimization in ensembling. Soft prompting [<xref ref-type="bibr" rid="ref41">41</xref>] might be another viable path to explore, especially given the availability of labeled data.</p><p>Our experiments show that LLMs struggle with end-to-end timeline extraction from clinical narratives (see <xref ref-type="fig" rid="figure4">Figure 4B</xref>). In <xref ref-type="table" rid="table4">Table 4</xref>, an examination of label distribution across the development set highlights a strong tendency of the system to overproduce BEGINS-ON and ENDS-ON relations while underrepresenting CONTAINS-1. For example, in colorectal cancer, the system predicted 381 BEGINS-ON and 281 ENDS-ON events, vastly exceeding the gold counts of 82 and 73, respectively. A notable source of error in the system&#x2019;s predictions stems from confusion in relation directionality, particularly with the CONTAINS-1 relation. By design, all triples are structured as &#x003C;EVENT, TLINK, TIMEX3&#x003E;, where CONTAINS-1 semantically indicates that the drug was administered on the date specified by the TIMEX3 (see the Tasks and Datasets subsection in the Methods section). However, the system frequently reversed this logic, producing incorrect &#x003C;EVENT, CONTAINS, TIMEX3&#x003E; triples. Such mispredictions not only result in spurious labels (captured under the CONTAINS category in the label distribution) but also reflect a deeper modeling issue: the model&#x2019;s difficulty in internalizing fine-grained relational semantics. 
To mitigate this, future work could incorporate explicit prompt instruction or soft constraints to enforce the expected directionality of relations during inference in the spirit of constrained decoding [<xref ref-type="bibr" rid="ref42">42</xref>]. In addition, postprocessing steps could validate predicted relations by checking for allowable type-direction combinations, correcting or filtering those that violate domain-specific rules.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Label distribution across the gold timelines and large language model (LLM) predicted timelines (LLAMA2 70B model, end-to-end setting) on the development set.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Cancer type</td><td align="left" valign="bottom" colspan="3">Gold timelines, n</td><td align="left" valign="bottom" colspan="4">System timelines, n</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">CONTAINS-1</td><td align="left" valign="top">BEGINS-ON</td><td align="left" valign="top">ENDS-ON</td><td align="left" valign="top">CONTAINS</td><td align="left" valign="top">CONTAINS-1</td><td align="left" valign="top">BEGINS-ON</td><td align="left" valign="top">ENDS-ON</td></tr></thead><tbody><tr><td align="left" valign="top">Breast cancer</td><td align="left" valign="top">16</td><td align="left" valign="top">11</td><td align="left" valign="top">12</td><td align="left" valign="top">1</td><td align="left" valign="top">2</td><td align="left" valign="top">49</td><td align="left" valign="top">21</td></tr><tr><td align="left" valign="top">Ovarian cancer</td><td align="left" valign="top">65</td><td align="left" valign="top">8</td><td align="left" valign="top">12</td><td align="left" valign="top">7</td><td align="left" valign="top">11</td><td align="left" valign="top">104</td><td align="left" valign="top">38</td></tr><tr><td align="left" valign="top">Melanoma</td><td align="left" 
valign="top">39</td><td align="left" valign="top">5</td><td align="left" valign="top">1</td><td align="left" valign="top">2</td><td align="left" valign="top">8</td><td align="left" valign="top">47</td><td align="left" valign="top">22</td></tr><tr><td align="left" valign="top">Colorectal cancer</td><td align="left" valign="top">97</td><td align="left" valign="top">82</td><td align="left" valign="top">73</td><td align="left" valign="top">87</td><td align="left" valign="top">0</td><td align="left" valign="top">381</td><td align="left" valign="top">281</td></tr></tbody></table></table-wrap><p>The error analysis also revealed incorrect annotations in the gold labels. We identified 30 annotation errors in the sample of the shared task dataset (~3.5 million words). The number of annotation errors in the CRC dataset sample is higher, but this is also the largest dataset (12 million+ words). Thus, as a proportion, the estimated annotation error rates across the independent datasets are similar. Annotation error is a standard hazard of the annotation process, especially for a highly cognitively demanding task as the timeline extraction from the entire patient&#x2019;s chart. One has to review every single document from the patient&#x2019;s chart, which for oncology patients translates into hundreds, if not thousands, of notes. Human errors are bound to happen. This further underscores the importance of developing methods for automatic and faithful timeline extraction.</p><p>A curious result emerges on the melanoma dataset. As shown in <xref ref-type="table" rid="table2">Table 2</xref>, the performance on the melanoma dataset is lower than the performance on other types of cancer using task-specific finetuned model. 
We believe this is caused by the data scarcity in the melanoma dataset because (1) SACT is not the main treatment modality for most melanoma presentations; therefore, there are fewer instances of SACT in the melanoma data and (2) the melanoma test set is the smallest of the 4 datasets. As the evaluation script computed the average <italic>F</italic><sub>1</sub>-scores across all patients, the overall performance on the melanoma test set fluctuated greatly with the score of individual patients.</p><p>In this work, we focus on cancer treatment timeline extraction. However, the methodology described in this work can be applied to treatment timeline extraction for other diseases. For instance, if gold standard datasets are available for an out-of-domain disease type, one can finetune a small LM for temporal relation extraction. If gold annotations are not available for a type of disease, prompting LLMs with a few domain-specific examples would be a viable solution.</p></sec><sec id="s4-2"><title>Limitations</title><p>In this work, we did not use powerful but proprietary LLMs such as GPT-4 [<xref ref-type="bibr" rid="ref26">26</xref>] or Gemini [<xref ref-type="bibr" rid="ref43">43</xref>], as we do not have access to nonretaining versions of these models for large-scale processing. Despite the fact that our dataset was deidentified per HIPAA requirements, we did not feel that it was ethically appropriate to submit patient-derived data to a retaining LLM. However, experimenting with open models presents a realistic scenario for the average academic center as experimenting with proprietary LLMs comes at a significant cost. The LLMs we selected in our study were those reported to have competitive performance to proprietary models [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. 
During paper revision, the DeepSeek-R1 [<xref ref-type="bibr" rid="ref44">44</xref>] open model was released, which outperformed the proprietary models on several general benchmarks. We leave experimentation with it as a future study. We did not use prompting techniques such as chain-of-thought [<xref ref-type="bibr" rid="ref45">45</xref>] because it is not clear how to directly convert a complex task such as timeline extraction from the entire EMR clinical narrative into a series of reasoning steps. We leave the exploration of using HIPAA-compliant versions of proprietary LLMs (access-dependent) and other prompting methods such as prompt-tuning [<xref ref-type="bibr" rid="ref46">46</xref>-<xref ref-type="bibr" rid="ref48">48</xref>] for future research. Another limitation is that the datasets represent 2 medical centers and thus may introduce institutional or regional biases. However, to the best of our knowledge, these datasets are the only ones on cancer treatment timelines available to the community. In addition, this study focuses on colorectal cancer, breast cancer, ovarian cancer, and melanoma. While these common cancer types are broadly representative, future work should extend the SACT timeline extraction task to other cancer types. We should note that such pan-cancer extensions necessitate significant resources for the creation of the gold annotations. We also acknowledge that some cancer journeys are complex, with lines of therapy containing SACT interspersed with other therapeutic modalities such as radiation; these complexities are out of scope for the current approach but should be a focus of future work. Finally, this work uses an established set of predefined temporal relations (CONTAINS, BEGINS-ON, ENDS-ON, OVERLAP, and BEFORE) and preexisting annotations. 
We acknowledge that modeling more complex and nuanced temporal scenarios might potentially provide additional insights; however, this is the core set the clinical temporal information extraction community has converged on with some minor nuances [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p></sec><sec id="s4-3"><title>Conclusions</title><p>In this paper, we explored approaches for patient-level timeline extraction through the task of SACT timeline extraction. We performed experiments on the 2024 ChemoTimelines shared task as well as on the THYME dataset; thus, the data represented 4 types of cancer across 2 institutions. We finetuned an LM that was specifically trained to attend to EVENT and TIMEX3 mentions. In doing so, we achieved higher scores than all shared task participants in Subtask1. We also explored LLM-based systems via prompting. In both subtasks, our LLM-based systems outperformed the shared task participant systems that took the approach of prompting LLMs. Our results contribute to the body of work that shows that task-specific finetuning based on rich, disease-specific datasets outperforms prompting the current generalist LLMs. We believe our results and analysis on this task add to the knowledge of extracting treatment timelines in EMRs using NLP methods. Our code will be released publicly upon acceptance.</p></sec></sec></body><back><ack><p>This paper is the result of funding in whole or in part by the National Institutes of Health (NIH). It is subject to the NIH Public Access Policy. Through acceptance of this federal funding, NIH has been given a right to make this manuscript publicly available in PubMed Central upon the Official Date of Publication, as defined by NIH. Funding is provided by the US NIH (grants U24CA248010 GS/EG/SF/DH/PdeG/JL/HH/EB/JW, R01LM010090 GS/JY/DH, R01LM013486 JY, and R01CA294033 DB). 
The content is solely the responsibility of the authors and does not necessarily represent the official views of the US NIH.</p></ack><notes><sec><title>Data Availability</title><p>The colorectal cancer dataset analyzed during this study is available to those involved in natural language processing research under a data use agreement (DUA) with Mayo Clinic. The corpus is distributed through the hNLP Center (center.healthnlp.org). The breast cancer, ovarian cancer, and melanoma datasets analyzed during this study are available under a DUA. Please contact author HH (email: harryh@pitt.edu) for details.</p></sec></notes><fn-group><fn fn-type="con"><p>JY contributed to conceptualization and methodology, performed the experiments with the assistance of EG and GS, and contributed to formal analysis, writing the original draft, review, and editing. EG contributed to conceptualization, methodology, software, formal analysis, writing the original draft, review, and editing. HH and SF contributed to data acquisition, writing &#x2013; review and editing. JL contributed to data acquisition. DH contributed to data curation and error analysis. PCdeG contributed to subject matter expertise, data curation, and writing &#x2013; review and editing. EB contributed subject matter expertise. DB contributed to subject matter expertise and writing &#x2013; review and editing. JLW contributed to subject matter expertise and writing &#x2013; review and editing. GS contributed to conceptualization, data curation, methodology, formal analysis, writing the original draft, review and editing, funding acquisition, and project administration. All authors discussed the results and commented on the manuscript.</p></fn><fn fn-type="conflict"><p>EB serves as a consultant/advisory board member for Pfizer, Werewolf Pharma, Merck, Iovance, Sanofi, Xilio, and Novartis. Clinical trial support from Lilly, Novartis, Partners Therapeutics, Genentech, and BVD. 
JLW reports consulting for Westat, The Lewin Group, and ownership in HemOnc.org LLC. He is also editor-in-chief of JCO CCI. DB reports Scientific Advisory Board membership for MercurialAI. She is also associate editor of Radiation Oncology, HemOnc.org (no financial compensation, unrelated to this work) and an associate editor of JCO CCI; funding from American Association for Cancer Research (unrelated to this work). None declared by the other authors.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BIO</term><def><p>beginning, inside, outside</p></def></def-item><def-item><term id="abb2">CRC</term><def><p>colorectal cancer</p></def></def-item><def-item><term id="abb3">DocTime</term><def><p>Document Creation Time</p></def></def-item><def-item><term id="abb4">DocTimeRel</term><def><p>relation with the Document Creation Time</p></def></def-item><def-item><term id="abb5">EMR</term><def><p>electronic medical record</p></def></def-item><def-item><term id="abb6">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb7">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb8">LM</term><def><p>language model</p></def></def-item><def-item><term id="abb9">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb10">SACT</term><def><p>systemic anticancer therapy</p></def></def-item><def-item><term id="abb11">SOTA</term><def><p>state-of-the-art</p></def></def-item><def-item><term id="abb12">THYME</term><def><p>Temporal Histories of Your Medical Event</p></def></def-item><def-item><term id="abb13">TIMEX3</term><def><p>time expressions</p></def></def-item><def-item><term id="abb14">TLINK</term><def><p>pairwise temporal relation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Styler</surname><given-names>WF</given-names>  <suffix>4th</suffix></name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Finan</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Temporal annotation in the clinical domain</article-title><source>Trans Assoc Comput Linguist</source><year>2014</year><month>04</month><volume>2</volume><fpage>143</fpage><lpage>154</lpage><pub-id pub-id-type="doi">10.1162/tacl_a_00172</pub-id><pub-id pub-id-type="medline">29082229</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>W</given-names> </name><name name-style="western"><surname>Rumshisky</surname><given-names>A</given-names> </name><name name-style="western"><surname>Uzuner</surname><given-names>O</given-names> </name></person-group><article-title>Evaluating temporal relations in clinical text: 2012 i2b2 Challenge</article-title><source>J Am Med Inform Assoc</source><year>2013</year><volume>20</volume><issue>5</issue><fpage>806</fpage><lpage>813</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2013-001628</pub-id><pub-id pub-id-type="medline">23564629</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Derczynski</surname><given-names>L</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name><name name-style="western"><surname>Pustejovsky</surname><given-names>J</given-names> </name><name name-style="western"><surname>Verhagen</surname><given-names>M</given-names> </name></person-group><article-title>SemEval-2015 task 6: clinical 
tempeval</article-title><conf-name>Proceedings of the 9th International Workshop on Semantic Evaluation (SemEval 2015)</conf-name><conf-date>Jun 4-5, 2015</conf-date><conf-loc>Denver, Colorado</conf-loc><pub-id pub-id-type="doi">10.18653/v1/S15-2136</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>WT</given-names> </name><name name-style="western"><surname>Derczynski</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pustejovsky</surname><given-names>J</given-names> </name><name name-style="western"><surname>Verhagen</surname><given-names>M</given-names> </name></person-group><article-title>SemEval-2016 task 12: clinical tempeval</article-title><conf-name>Proceedings of the 10th International Workshop on Semantic Evaluation (SemEval-2016)</conf-name><conf-date>Jun 16-17, 2016</conf-date><conf-loc>San Diego, California</conf-loc><pub-id pub-id-type="doi">10.18653/v1/S16-1165</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name><name name-style="western"><surname>Palmer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pustejovsky</surname><given-names>J</given-names> </name></person-group><article-title>SemEval-2017 task 12: clinical tempeval</article-title><conf-name>Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)</conf-name><conf-date>Aug 3-4, 2017</conf-date><conf-loc>Vancouver, 
Canada</conf-loc><pub-id pub-id-type="doi">10.18653/v1/S17-2093</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Laparra</surname><given-names>E</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Elsayed</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Palmer</surname><given-names>M</given-names> </name></person-group><article-title>SemEval 2018 task 6: parsing time normalizations</article-title><conf-name>Proceedings of The 12th International Workshop on Semantic Evaluation</conf-name><conf-date>Jun 5-6, 2018</conf-date><conf-loc>New Orleans, Louisiana</conf-loc><pub-id pub-id-type="doi">10.18653/v1/S18-1011</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Dligach</surname><given-names>D</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name></person-group><article-title>Neural temporal relation extraction</article-title><conf-name>Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers</conf-name><conf-date>Apr 3-7, 2017</conf-date><conf-loc>Valencia, Spain</conf-loc><pub-id pub-id-type="doi">10.18653/v1/E17-2118</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dligach</surname><given-names>D</given-names> </name><name name-style="western"><surname>Amiri</surname><given-names>H</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name></person-group><article-title>Self-training improves recurrent neural networks performance for temporal relation extraction</article-title><conf-name>Proceedings of the Ninth International Workshop on Health Text Mining and Information Analysis</conf-name><conf-date>Nov 1, 2018</conf-date><conf-loc>Brussels, Belgium</conf-loc><pub-id pub-id-type="doi">10.18653/v1/W18-5619</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dligach</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name></person-group><article-title>A BERT-based universal model for both within- and cross-sentence clinical temporal relation extraction</article-title><conf-name>Proceedings of the 2nd Clinical Natural Language Processing Workshop</conf-name><conf-date>Jun 9, 2019</conf-date><conf-loc>Minneapolis, Minnesota, USA</conf-loc><pub-id pub-id-type="doi">10.18653/v1/W19-1908</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dligach</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sadeque</surname><given-names>F</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name></person-group><article-title>A BERT-based one-pass multi-task model for clinical temporal relation extraction</article-title><conf-name>Proceedings of the 19th SIGBioMed Workshop on Biomedical Language Processing</conf-name><conf-date>Jun 9, 2020</conf-date><pub-id pub-id-type="doi">10.18653/v1/2020.bionlp-1.7</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dligach</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name></person-group><article-title>EntityBERT: entity-centric masking strategy for model pretraining for the clinical domain</article-title><conf-name>Proceedings of the 20th Workshop on Biomedical Language Processing</conf-name><conf-date>Jun 11, 2021</conf-date><pub-id pub-id-type="doi">10.18653/v1/2021.bionlp-1.21</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Yuan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Ananiadou</surname><given-names>S</given-names> </name></person-group><article-title>Zero-shot temporal relation extraction with chatgpt</article-title><conf-name>The 22nd Workshop on Biomedical Natural Language Processing and BioNLP Shared Tasks</conf-name><conf-date>Jul 13, 2023</conf-date><conf-loc>Toronto, Canada</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2023.bionlp-1.7</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hochheiser</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>W</given-names> </name><name name-style="western"><surname>Goldner</surname><given-names>E</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name></person-group><article-title>Overview of the 2024 shared task on chemotherapy treatment timeline extraction</article-title><conf-name>Proceedings of the 6th Clinical Natural Language Processing Workshop</conf-name><conf-date>Jun 21, 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2024.clinicalnlp-1.53</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wright-Bettner</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>C</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Defining and learning refined 
temporal relations in the clinical narrative</article-title><conf-name>Proceedings of the 11th International Workshop on Health Text Mining and Information Analysis</conf-name><conf-date>Nov 20, 2020</conf-date><pub-id pub-id-type="doi">10.18653/v1/2020.louhi-1.12</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Haddadan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Le</surname><given-names>TD</given-names> </name><name name-style="western"><surname>Duong</surname><given-names>T</given-names> </name><name name-style="western"><surname>Thieu</surname><given-names>T</given-names> </name></person-group><article-title>LAILab at chemotimelines 2024: finetuning sequence-to-sequence language models for temporal relation extraction towards cancer patient undergoing chemotherapy treatment</article-title><conf-name>Proceedings of the 6th Clinical Natural Language Processing Workshop</conf-name><conf-date>Jun 21, 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2024.clinicalnlp-1.37</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Li</surname><given-names>R</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name></person-group><article-title>Wonder at chemotimelines 2024: medtimeline: an end-to-end NLP system for timeline extraction from clinical narratives</article-title><conf-name>Proceedings of the 6th Clinical Natural Language Processing 
Workshop</conf-name><conf-date>Jun 21, 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2024.clinicalnlp-1.48</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sharma</surname><given-names>V</given-names> </name><name name-style="western"><surname>Fernandez</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ioanovici</surname><given-names>A</given-names> </name><name name-style="western"><surname>Talby</surname><given-names>D</given-names> </name><name name-style="western"><surname>Buijs</surname><given-names>F</given-names> </name></person-group><article-title>Lexicans at chemotimelines 2024: chemotimeline chronicles - leveraging large language models (llms) for temporal relations extraction in oncological electronic health records</article-title><conf-name>Proceedings of the 6th Clinical Natural Language Processing Workshop</conf-name><conf-date>Jun 21, 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2024.clinicalnlp-1.38</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bannour</surname><given-names>N</given-names> </name><name name-style="western"><surname>Andrew</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Vincent</surname><given-names>M</given-names> </name></person-group><article-title>Team nlpeers at chemotimelines 2024: evaluation of two timeline extraction methods, can generative LLM do it all or is smaller model fine-tuning still relevant?</article-title><conf-name>Proceedings of the 6th Clinical Natural Language Processing Workshop</conf-name><conf-date>Jun 21, 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><pub-id 
pub-id-type="doi">10.18653/v1/2024.clinicalnlp-1.39</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chung</surname><given-names>HW</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Longpre</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zoph</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Fedus</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Scaling instruction-finetuned language models</article-title><source>J Mach Learn Res</source><year>2024</year><access-date>2025-07-31</access-date><volume>25</volume><fpage>1</fpage><lpage>53</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://jmlr.org/papers/volume25/23-0870/23-0870.pdf">https://jmlr.org/papers/volume25/23-0870/23-0870.pdf</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Goyal</surname><given-names>N</given-names> </name><etal/></person-group><article-title>BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension</article-title><conf-name>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</conf-name><conf-date>Jul 5-10, 2020</conf-date><pub-id pub-id-type="doi">10.18653/v1/2020.acl-main.703</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ott</surname><given-names>M</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><name name-style="western"><surname>Stoyanov</surname><given-names>V</given-names> </name></person-group><article-title>Pretrained language models for biomedical and clinical tasks: understanding and extending the state-of-the-art</article-title><conf-name>Proceedings of the 3rd Clinical Natural Language Processing Workshop</conf-name><conf-date>Nov 19, 2020</conf-date><pub-id pub-id-type="doi">10.18653/v1/2020.clinicalnlp-1.17</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tinn</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Domain-specific language model pretraining for biomedical natural language processing</article-title><source>ACM Trans Comput Healthcare</source><year>2022</year><month>01</month><day>31</day><volume>3</volume><issue>1</issue><fpage>1</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1145/3458754</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Stone</surname><given-names>K</given-names> </name><name name-style="western"><surname>Albert</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>Almahairi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Babaei</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Llama 2: open foundation and fine-tuned chat models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 19, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.09288</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Roux</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> </name><name name-style="western"><surname>Savary</surname><given-names>B</given-names> </name><name name-style="western"><surname>Bamford</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Mixtral of experts</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 8, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.04088</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><name name-style="western"><surname>Subbiah</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kaplan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dhariwal</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Language 
models are few-shot learners</article-title><source>arXiv</source><comment>Preprint posted online on  May 28, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 15, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Raffel</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shazeer</surname><given-names>NM</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Exploring the limits of transfer learning with a unified text-to-text transformer</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 23, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1910.10683</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kweon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ku</surname><given-names>LW</given-names> </name><name name-style="western"><surname>Martins</surname><given-names>A</given-names> </name><name name-style="western"><surname>Srikumar</surname><given-names>V</given-names> </name></person-group><article-title>Publicly shareable clinical large 
language model built on synthetic clinical notes</article-title><conf-name>Findings of the Association for Computational Linguistics ACL 2024</conf-name><conf-date>Aug 11-16, 2024</conf-date><conf-loc>Bangkok, Thailand</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.305</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Gemma Team</collab><name name-style="western"><surname>Mesnard</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hardin</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Gemma: open models based on gemini research and technology</article-title><source>arXiv</source><comment>Preprint posted online on Apr 16, 2024</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2403.08295</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><conf-name>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</conf-name><conf-date>Jun 2-7, 2019</conf-date><conf-loc>Minneapolis, Minnesota</conf-loc><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name
name-style="western"><surname>Savova</surname><given-names>GK</given-names> </name><name name-style="western"><surname>Masanz</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Ogren</surname><given-names>PV</given-names> </name><etal/></person-group><article-title>Mayo clinical Text Analysis and Knowledge Extraction System (cTAKES): architecture, component evaluation and applications</article-title><source>J Am Med Inform Assoc</source><year>2010</year><volume>17</volume><issue>5</issue><fpage>507</fpage><lpage>513</lpage><pub-id pub-id-type="doi">10.1136/jamia.2009.001560</pub-id><pub-id pub-id-type="medline">20819853</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name></person-group><article-title>A synchronous context free grammar for time normalization</article-title><conf-name>Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Oct 18-21, 2013</conf-date><conf-loc>Seattle, Washington, USA</conf-loc><pub-id pub-id-type="doi">10.18653/v1/D13-1078</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Parker</surname><given-names>J</given-names> </name></person-group><article-title>A semantically compositional annotation scheme for time normalization</article-title><access-date>2025-07-31</access-date><conf-name>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC&#x2019;16)</conf-name><conf-date>May 23-28, 2016</conf-date><conf-loc>Portoro&#x017E;, Slovenia</conf-loc><comment><ext-link ext-link-type="uri" 
xlink:href="https://aclanthology.org/L16-1599/">https://aclanthology.org/L16-1599/</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pustejovsky</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bunt</surname><given-names>H</given-names> </name><name name-style="western"><surname>Romary</surname><given-names>L</given-names> </name></person-group><article-title>ISO-timeml: an international standard for semantic annotation</article-title><access-date>2025-07-31</access-date><conf-name>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC&#x2019;10)</conf-name><conf-date>May 17-23, 2010</conf-date><conf-loc>Valletta, Malta</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="http://www.lrec-conf.org/proceedings/lrec2010/pdf/55_Paper.pdf">http://www.lrec-conf.org/proceedings/lrec2010/pdf/55_Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lavril</surname><given-names>T</given-names> </name><name name-style="western"><surname>Izacard</surname><given-names>G</given-names> </name><name name-style="western"><surname>Martinet</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lachaux</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Lacroix</surname><given-names>T</given-names> </name><etal/></person-group><article-title>LLaMA: open and efficient foundation language models</article-title><source>arXiv</source><comment>Preprint posted online on  Feb, 2023</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2302.13971</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 15, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Tan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Dede</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name></person-group><article-title>KCLab at chemotimelines 2024: end-to-end system for chemotherapy timeline extraction &#x2013; subtask2</article-title><conf-name>Proceedings of the 6th Clinical Natural Language Processing Workshop</conf-name><conf-date>Jun 21, 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2024.clinicalnlp-1.40</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shin</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Bode</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>Z</given-names> </name></person-group><article-title>Addressing the challenges of applying precision oncology</article-title><source>NPJ Precis Oncol</source><year>2017</year><volume>1</volume><issue>1</issue><fpage>28</fpage><pub-id pub-id-type="doi">10.1038/s41698-017-0032-z</pub-id><pub-id pub-id-type="medline">29872710</pub-id></nlm-citation></ref><ref 
id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cortes</surname><given-names>C</given-names> </name><name name-style="western"><surname>Vapnik</surname><given-names>V</given-names> </name></person-group><article-title>Support-vector networks</article-title><source>Mach Learn</source><year>1995</year><month>09</month><volume>20</volume><issue>3</issue><fpage>273</fpage><lpage>297</lpage><pub-id pub-id-type="doi">10.1007/BF00994018</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name></person-group><article-title>Two-stage fine-tuning for improved bias and variance for large pretrained language models</article-title><conf-name>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>Jul 9-14, 2023</conf-date><conf-loc>Toronto, Canada</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2023.acl-long.877</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Perova</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Mandloi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lewis</surname><given-names>E</given-names> </name><name
name-style="western"><surname>Parkinson</surname><given-names>H</given-names> </name><name name-style="western"><surname>Savova</surname><given-names>G</given-names> </name></person-group><article-title>Extracting knowledge from scientific texts on patient-derived cancer models using large language models: algorithm development and validation</article-title><source>bioRxiv</source><comment>Preprint posted online on  Jan 29, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.01.28.634527</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Geng</surname><given-names>S</given-names> </name><name name-style="western"><surname>Josifoski</surname><given-names>M</given-names> </name><name name-style="western"><surname>Peyrard</surname><given-names>M</given-names> </name><name name-style="western"><surname>West</surname><given-names>R</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Bouamor</surname><given-names>H</given-names> </name><name name-style="western"><surname>Pino</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bali</surname><given-names>K</given-names> </name></person-group><article-title>Grammar-constrained decoding for structured NLP tasks without finetuning</article-title><conf-name>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 6-10, 2023</conf-date><conf-loc>Singapore</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2023.emnlp-main.674</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Saab</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Weng</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Tanno</surname><given-names>R</given-names> </name><name name-style="western"><surname>Stutz</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wulczyn</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Capabilities of gemini models in medicine</article-title><source>arXiv</source><comment>Preprint posted online on May 1, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.18416</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>DeepSeek-AI</collab><name name-style="western"><surname>Guo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><etal/></person-group><article-title>DeepSeek-R1: incentivizing reasoning capability in llms via reinforcement learning</article-title><source>arXiv</source><comment>Preprint posted online on Jan 22, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.12948</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><source>arXiv</source><comment>Preprint posted online on Jan 28, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2201.11903</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name
name-style="western"><surname>Lester</surname><given-names>B</given-names> </name><name name-style="western"><surname>Al-Rfou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Constant</surname><given-names>N</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Moens</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Specia</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yih</surname><given-names>SW</given-names> </name></person-group><article-title>The power of scale for parameter-efficient prompt tuning</article-title><conf-name>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Nov 7-11, 2021</conf-date><conf-loc>Punta Cana, Dominican Republic</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2021.emnlp-main.243</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>XL</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>P</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Zong</surname><given-names>C</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>F</given-names> </name><name name-style="western"><surname>Li</surname><given-names>W</given-names> </name><name name-style="western"><surname>Navigli</surname><given-names>R</given-names> </name></person-group><article-title>Prefix-tuning: optimizing continuous prompts for
generation</article-title><conf-name>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing</conf-name><conf-date>Aug 1-6, 2021</conf-date><pub-id pub-id-type="doi">10.18653/v1/2021.acl-long.353</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Du</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>GPT understands, too</article-title><source>AI Open</source><year>2024</year><volume>5</volume><fpage>208</fpage><lpage>215</lpage><pub-id pub-id-type="doi">10.1016/j.aiopen.2023.08.012</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>More details on data statistics, experimental settings, results, and error analysis.</p><media xlink:href="bioinform_v6i1e67801_app1.docx" xlink:title="DOCX File, 44 KB"/></supplementary-material></app-group></back></article>