fullrecord |
[{"key": "dc.contributor.advisor", "value": "\u00c4yr\u00e4m\u00f6, Sami", "language": "", "element": "contributor", "qualifier": "advisor", "schema": "dc"}, {"key": "dc.contributor.advisor", "value": "Ruohonen, Toni", "language": "", "element": "contributor", "qualifier": "advisor", "schema": "dc"}, {"key": "dc.contributor.advisor", "value": "Moilanen, Miika", "language": "", "element": "contributor", "qualifier": "advisor", "schema": "dc"}, {"key": "dc.contributor.author", "value": "Ihalainen, Simo", "language": "", "element": "contributor", "qualifier": "author", "schema": "dc"}, {"key": "dc.date.accessioned", "value": "2022-01-07T06:56:24Z", "language": null, "element": "date", "qualifier": "accessioned", "schema": "dc"}, {"key": "dc.date.available", "value": "2022-01-07T06:56:24Z", "language": null, "element": "date", "qualifier": "available", "schema": "dc"}, {"key": "dc.date.issued", "value": "2022", "language": "", "element": "date", "qualifier": "issued", "schema": "dc"}, {"key": "dc.identifier.uri", "value": "https://jyx.jyu.fi/handle/123456789/79242", "language": null, "element": "identifier", "qualifier": "uri", "schema": "dc"}, {"key": "dc.description.abstract", "value": "Terveydenhuollossa suuri m\u00e4\u00e4r\u00e4 dataa on tallennettuna elektronisiin potilastietoj\u00e4rjestelmiin potilaskertomusten muodossa. Potilaskertomustekstien tehokas hy\u00f6dynt\u00e4minen p\u00e4ivitt\u00e4isess\u00e4 hoitoty\u00f6ss\u00e4 ja kliinisess\u00e4 tutkimuksessa vaatii edistyneiden luonnollisen kielen k\u00e4sittelyalgoritmien k\u00e4ytt\u00f6\u00e4 oleellisen data poimimiseksi potilasteksteist\u00e4. Monet t\u00e4h\u00e4n tarkoitukseen soveltuvat koneoppimisen menetelm\u00e4t vaativat suuria m\u00e4\u00e4ri\u00e4 luokiteltua opetusdataa k\u00e4ytett\u00e4v\u00e4ksi mallin koulutukseen, mik\u00e4 on potilaskertomusten tapauksessa aikaa viev\u00e4\u00e4 ja kallista toteuttaa. T\u00e4m\u00e4n opinn\u00e4ytety\u00f6n tarkoituksena oli tutkia automaattista opetusdatan luokittelua ja automaattisesti luodulla opetusdatalla koulutettujen mallien suorituskyky\u00e4 kahden l\u00e4\u00e4ketie-teellisen riskitekij\u00e4n (korkea kolesteroli, haitallinen alkoholink\u00e4ytt\u00f6) luokitteluun potilaskertomuksista. Kehitettyjen s\u00e4\u00e4nt\u00f6jen avulla luotiin automaattisesti luokiteltu opetusdatasetti, jota k\u00e4ytettiin eri koneoppimismallien kouluttamiseen. Samat mallit koulutettiin my\u00f6s manuaalisesti luokitellulla 200 lauseen opetusdatasetill\u00e4. BERT-malli saavutti parhaan luokittelutarkkuuden sek\u00e4 kolesterolin (94 %) ett\u00e4 alkoholin (91 %) tapauksessa. BERT-malli pystyi hy\u00f6dynt\u00e4m\u00e4\u00e4n luonnollisen kielen ymm\u00e4rryst\u00e4 ja saavuttamaan paremman luokittelutarkkuuden kuin mihin opetusdatan luomiseen k\u00e4ytetyt s\u00e4\u00e4nn\u00f6t pystyiv\u00e4t. Kaikki automaattisesti luodulla opetusdatalla koulutetut mallit p\u00e4\u00e4siv\u00e4t parempaan luokittelutarkkuuteen kuin mihin vastaavat pienell\u00e4 manuaalisesti luokitellulla opetusdatalla koulutetut mallit pystyiv\u00e4t. Automaattinen opetusdatan luokittelu saattaisi olla arvokas ty\u00f6kalu koneoppimisprojektien kustannusten pienent\u00e4miseen tilanteissa, joissa opetusdatan manuaalinen luokittelu on aikaa viev\u00e4\u00e4, kallista ja vaatii sovellusalan asiantuntijan ty\u00f6panosta.", "language": "fi", "element": "description", "qualifier": "abstract", "schema": "dc"}, {"key": "dc.description.abstract", "value": "Large amounts of patient data is stored in electronic health records in unstructured data form as clinical narratives. The efficient use of clinical narratives in day-to-day care and clinical research requires advanced natural language processing methods to extract data from the texts. The common problem for many deep learning algorithms is the requirement for vast amounts of labeled training data, which is time consuming and expensive to acquire in the clinical narrative context. The purpose of this thesis was to assess a weak supervision based approach in automatic training data labeling, and the subsequent machine learning model per-formance in classifying two medical risk factors in Finnish language clinical narratives: high cholesterol and alcohol consumption. Heuristic rules were developed to automatically label sentences collected from clinical narratives to create a training dataset. Different machine learning models were trained with automatically labeled training dataset and with 200 manually labeled sentences. BERT model achieved the highest overall classification accuracy of 94 % in cholesterol task and 91 % in alcohol task. BERT model was able to capture hidden patterns in the data and leverage the natural language understanding to produce better classification results and classify cases which were not captured by the rules used to create the training data. All machine learning models trained with the automatically labeled data produced better classification results compared to the models trained with a small manually labeled dataset. Weak supervision approach might be a valuable tool to reduce the costs of applying machine learning algorithms in low-resource settings, where manual labeling process is time consuming, expensive, or requires the expertise of subject specialist.", "language": "en", "element": "description", "qualifier": "abstract", "schema": "dc"}, {"key": "dc.description.provenance", "value": "Submitted by Paivi Vuorio (paelvuor@jyu.fi) on 2022-01-07T06:56:24Z\nNo. of bitstreams: 0", "language": "en", "element": "description", "qualifier": "provenance", "schema": "dc"}, {"key": "dc.description.provenance", "value": "Made available in DSpace on 2022-01-07T06:56:24Z (GMT). No. of bitstreams: 0\n Previous issue date: 2022", "language": "en", "element": "description", "qualifier": "provenance", "schema": "dc"}, {"key": "dc.format.extent", "value": "48", "language": "", "element": "format", "qualifier": "extent", "schema": "dc"}, {"key": "dc.format.mimetype", "value": "application/pdf", "language": null, "element": "format", "qualifier": "mimetype", "schema": "dc"}, {"key": "dc.language.iso", "value": "eng", "language": null, "element": "language", "qualifier": "iso", "schema": "dc"}, {"key": "dc.rights", "value": "In Copyright", "language": "en", "element": "rights", "qualifier": null, "schema": "dc"}, {"key": "dc.subject.other", "value": "natural language processing", "language": "", "element": "subject", "qualifier": "other", "schema": "dc"}, {"key": "dc.subject.other", "value": "clinical narratives", "language": "", "element": "subject", "qualifier": "other", "schema": "dc"}, {"key": "dc.subject.other", "value": "text analytics", "language": "", "element": "subject", "qualifier": "other", "schema": "dc"}, {"key": "dc.subject.other", "value": "medical risk factors", "language": "", "element": "subject", "qualifier": "other", "schema": "dc"}, {"key": "dc.subject.other", "value": "weak supervision", "language": "", "element": "subject", "qualifier": "other", "schema": "dc"}, {"key": "dc.subject.other", "value": "automatic training data labeling", "language": "", "element": "subject", "qualifier": "other", "schema": "dc"}, {"key": "dc.title", "value": "Automatic training data labeling for Finnish clinical narrative NLP tasks", "language": "", "element": "title", "qualifier": null, "schema": "dc"}, {"key": "dc.type", "value": "master thesis", "language": null, "element": "type", "qualifier": null, "schema": "dc"}, {"key": "dc.identifier.urn", "value": "URN:NBN:fi:jyu-202201071021", "language": "", "element": "identifier", "qualifier": "urn", "schema": "dc"}, {"key": "dc.type.ontasot", "value": "Pro gradu -tutkielma", "language": "fi", "element": "type", "qualifier": "ontasot", "schema": "dc"}, {"key": "dc.type.ontasot", "value": "Master\u2019s thesis", "language": "en", "element": "type", "qualifier": "ontasot", "schema": "dc"}, {"key": "dc.contributor.faculty", "value": "Informaatioteknologian tiedekunta", "language": "fi", "element": "contributor", "qualifier": "faculty", "schema": "dc"}, {"key": "dc.contributor.faculty", "value": "Faculty of Information Technology", "language": "en", "element": "contributor", "qualifier": "faculty", "schema": "dc"}, {"key": "dc.contributor.department", "value": "Informaatioteknologia", "language": "fi", "element": "contributor", "qualifier": "department", "schema": "dc"}, {"key": "dc.contributor.department", "value": "Information Technology", "language": "en", "element": "contributor", "qualifier": "department", "schema": "dc"}, {"key": "dc.contributor.organization", "value": "Jyv\u00e4skyl\u00e4n yliopisto", "language": "fi", "element": "contributor", "qualifier": "organization", "schema": "dc"}, {"key": "dc.contributor.organization", "value": "University of Jyv\u00e4skyl\u00e4", "language": "en", "element": "contributor", "qualifier": "organization", "schema": "dc"}, {"key": "dc.subject.discipline", "value": "Tietotekniikka", "language": "fi", "element": "subject", "qualifier": "discipline", "schema": "dc"}, {"key": "dc.subject.discipline", "value": "Mathematical Information Technology", "language": "en", "element": "subject", "qualifier": "discipline", "schema": "dc"}, {"key": "yvv.contractresearch.collaborator", "value": "business", "language": "", "element": "contractresearch", "qualifier": "collaborator", "schema": "yvv"}, {"key": "yvv.contractresearch.funding", "value": "0", "language": "", "element": "contractresearch", "qualifier": "funding", "schema": "yvv"}, {"key": "yvv.contractresearch.initiative", "value": "business", "language": "", "element": "contractresearch", "qualifier": "initiative", "schema": "yvv"}, {"key": "dc.type.coar", "value": "http://purl.org/coar/resource_type/c_bdcc", "language": null, "element": "type", "qualifier": "coar", "schema": "dc"}, {"key": "dc.rights.accesslevel", "value": "openAccess", "language": null, "element": "rights", "qualifier": "accesslevel", "schema": "dc"}, {"key": "dc.type.publication", "value": "masterThesis", "language": null, "element": "type", "qualifier": "publication", "schema": "dc"}, {"key": "dc.subject.oppiainekoodi", "value": "602", "language": "", "element": "subject", "qualifier": "oppiainekoodi", "schema": "dc"}, {"key": "dc.subject.yso", "value": "sairauskertomukset", "language": null, "element": "subject", "qualifier": "yso", "schema": "dc"}, {"key": "dc.subject.yso", "value": "koneoppiminen", "language": null, "element": "subject", "qualifier": "yso", "schema": "dc"}, {"key": "dc.subject.yso", "value": "tiedonlouhinta", "language": null, "element": "subject", "qualifier": "yso", "schema": "dc"}, {"key": "dc.subject.yso", "value": "luonnollinen kieli", "language": null, "element": "subject", "qualifier": "yso", "schema": "dc"}, {"key": "dc.subject.yso", "value": "NLP", "language": null, "element": "subject", "qualifier": "yso", "schema": "dc"}, {"key": "dc.subject.yso", "value": "case records (patient documents)", "language": null, "element": "subject", "qualifier": "yso", "schema": "dc"}, {"key": "dc.subject.yso", "value": "machine learning", "language": null, "element": "subject", "qualifier": "yso", "schema": "dc"}, {"key": "dc.subject.yso", "value": "data mining", "language": null, "element": "subject", "qualifier": "yso", "schema": "dc"}, {"key": "dc.subject.yso", "value": "natural language", "language": null, "element": "subject", "qualifier": "yso", "schema": "dc"}, {"key": "dc.subject.yso", "value": "NLP", "language": null, "element": "subject", "qualifier": "yso", "schema": "dc"}, {"key": "dc.format.content", "value": "fulltext", "language": null, "element": "format", "qualifier": "content", "schema": "dc"}, {"key": "dc.rights.url", "value": "https://rightsstatements.org/page/InC/1.0/", "language": null, "element": "rights", "qualifier": "url", "schema": "dc"}, {"key": "dc.type.okm", "value": "G2", "language": null, "element": "type", "qualifier": "okm", "schema": "dc"}]
|