diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..6fffd10b --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2020 Jimmy Lin + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md index a46be9c7..6f425524 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,39 @@ # PyGaggle -A gaggle of CORD-19 rerankers. +A gaggle of rerankers for [CovidQA](https://github.com/castorini/pygaggle/blob/master/data/) and CORD-19. ## Installation -1. `conda env create -f environment.yml && conda activate pygaggle` +1. For pip, do `pip install pygaggle`. If you prefer Anaconda, use `conda env create -f environment.yml && conda activate pygaggle`. 2. Install [PyTorch 1.4+](http://pytorch.org/). -3. Download the index: `sh scripts/update-index.sh` +3. Download the index: `sh scripts/update-index.sh`. -4. Make sure you have an installation of Java 8+: `javac --version` +4. Make sure you have an installation of Java 11+: `javac --version`. +5. 
Install [Anserini](https://github.com/castorini/anserini). -## Evaluating Highlighters -Run `sh scripts/evaluate-highlighters.sh`. \ No newline at end of file +## Running rerankers on CovidQA + +By default, the evaluation script uses `data/lucene-index-covid-paragraph` for the index path. +To use a different index, set the environment variable `CORD19_INDEX_PATH` to the path of the index. + + +### Unsupervised Methods + +**BM25**: `python -um pygaggle.run.evaluate_kaggle_highlighter --method bm25` + +**BERT**: `python -um pygaggle.run.evaluate_kaggle_highlighter --method transformer --model-name bert-base-cased` + +**SciBERT**: `python -um pygaggle.run.evaluate_kaggle_highlighter --method transformer --model-name allenai/scibert_scivocab_cased` + +**BioBERT**: `python -um pygaggle.run.evaluate_kaggle_highlighter --method transformer --model-name biobert` + + +### Supervised Methods + +**T5 (MARCO)**: `python -um pygaggle.run.evaluate_kaggle_highlighter --method t5` + +Instructions for our other MARCO and SQuAD models are coming soon. 
diff --git a/data/kaggle-lit-review-0.1.json b/data/kaggle-lit-review-0.1.json new file mode 100644 index 00000000..00f20eb3 --- /dev/null +++ b/data/kaggle-lit-review-0.1.json @@ -0,0 +1,862 @@ +{ + "version":"0.1", + "categories":[ + { + "name":"Incubation period", + "sub_categories":[ + { + "nq_name":"What is the incubation period of the virus?", + "kq_name":"Incubation period of the virus", + "answers":[ + { + "id":"wuclekt6", + "title":"Longitudinal analysis of laboratory findings during the process of recovery for patients with COVID-19", + "exact_answer":"4 days (IQR, 2-7)" + }, + { + "id":"e3t1f0rt", + "title":"Epidemiological Characteristics of COVID-19; a Systemic Review and Meta-Analysis 1", + "exact_answer":"5.84 (99% CI: 4.83, 6.85) days" + }, + { + "id":"ragcpbl6", + "title":"Evolving epidemiology of novel coronavirus diseases 2019 and possible interruption of local transmission outside Hubei Province in China: a descriptive and modeling study", + "exact_answer":"mean incubation period of 5.2 days" + }, + { + "id":"n0uwy77g", + "title":"Clinical characteristics and durations of hospitalized patients with COVID-19 in Beijing: a retrospective cohort study", + "exact_answer":"4 (3-7) days" + }, + { + "id":"x23ej29m", + "title":"Clinical features and obstetric and neonatal outcomes of pregnant patients with COVID-19 in Wuhan, China: a retrospective, single-centre, descriptive study", + "exact_answer":"5 days (range 2-9 days)" + }, + { + "id":"56zhxd6e", + "title":"Epidemiological parameters of coronavirus disease 2019: a pooled analysis of publicly reported individual data of 1155 cases from seven countries Summary Background", + "exact_answer":"7.44 days" + }, + { + "id":"zph6r4il", + "title":"Epidemiological, clinical and virological characteristics of 74 cases of coronavirus-infected disease 2019 (COVID-19) with gastrointestinal symptoms", + "exact_answer":"4 days (IQR 3-7 days)" + }, + { + "id":"djq0lvr2", + "title":"Is a 14-day quarantine period 
optimal for effectively controlling coronavirus disease 2019 (COVID-19)?", + "exact_answer":"The median incubation period of both male and female adults was similar (7-day) but significantly shorter than that (9-day) of child cases" + }, + { + "id":"n0vmb946", + "title":"The difference in the incubation period of 2019 novel coronavirus (SARS-CoV-2) infection between travelers to Hubei and non-travelers: The need of a longer quarantine period", + "exact_answer":"1.8 and 7.2 days" + }, + { + "id":"awgyxn3t", + "title":"Clinical Characteristics of 34 Children with Coronavirus Disease-2019 in the West of China: a Multiple-center Case Series", + "exact_answer":"10.50 (7.75 -25.25) days" + }, + { + "id":"0hnh4n9e", + "title":"Investigation of three clusters of COVID-19 in Singapore: implications for surveillance and response measures", + "exact_answer":"4 days (IQR 3-6)" + }, + { + "id":"it4ka7v0", + "title":"Estimation of incubation period distribution of COVID-19 using disease onset forward time: a novel cross-sectional and forward follow-up study", + "exact_answer":"median of incubation period is 8·13 days (95% confidence interval [CI]: 7·37-8·91), the mean is 8·62 days (95% CI: 8·02-9·28)" + }, + { + "id":"glq0lckz", + "title":"Clinical Characteristics of SARS-CoV-2 Pneumonia Compared to Controls in Chinese Han Population", + "exact_answer":"4 days (IQR, 2 to 7)" + }, + { + "id":"8anqfkmo", + "title":"The Incubation Period of Coronavirus Disease 2019 (COVID-19) From Publicly Reported Confirmed Cases: Estimation and Application", + "exact_answer":"5.1 days (CI, 4.5 to 5.8 days)" + }, + { + "id":"v3gww4iv", + "title":"Transmission of corona virus disease 2019 during the incubation period may lead to a quarantine loophole", + "exact_answer":"4.9 days (95% confidence interval [CI], 4.4 to 5.4) days" + }, + { + "id":"66ulqu11", + "title":"Transmission interval estimates suggest pre-symptomatic spread of COVID-19", + "exact_answer":"7.1 (6.13, 8.25) days for Singapore and 
9 (7.92, 10.2) days for Tianjin" + }, + { + "id":"ti9b1etu", + "title":"Transmission and clinical characteristics of coronavirus disease 2019 in 104 outside-Wuhan patients, China", + "exact_answer":"6 days, ranged from 1 to 32 days" + }, + { + "id":"k3f7ohzg", + "title":"Characteristics of COVID-19 infection in Beijing", + "exact_answer":"6.7 days" + }, + { + "id":"jxtch47t", + "title":"Epidemiologic and Clinical Characteristics of 91 Hospitalized Patients with COVID-19 in Zhejiang, China: A retrospective, multi-centre case series", + "exact_answer":"6 (IQR, 3-8) days" + }, + { + "id":"dbzrd23n", + "title":"Title: A descriptive study of the impact of diseases control and prevention on the epidemics 1 dynamics and clinical features of SARS-CoV-2 outbreak in Shanghai, lessons learned for", + "exact_answer":"6.4 days (95% 175 CI 5.3 to 7.6)" + }, + { + "id":"j3avpu1y", + "title":"A familial cluster of pneumonia associated with the 2019 novel coronavirus indicating person-to-person transmission: a study of a family cluster", + "exact_answer":"3-6 days" + }, + { + "id":"1mxjklgx", + "title":"Epidemiological characteristics of 1212 COVID-19 patients in Henan, China. 
medRxiv", + "exact_answer":"average, mode and median incubation periods are 7.4, 4 and 7 days" + }, + { + "id":"ykofrn9i", + "title":"Incubation Period and Other Epidemiological Characteristics of 2019 Novel Coronavirus Infections with Right Truncation: A Statistical Analysis of Publicly Available Case Data", + "exact_answer":"5.6 days (95% CI: 4.4, 7.4)" + }, + { + "id":"u8goc7io", + "title":"Title: The incubation period of 2019-nCoV infections among travellers from Wuhan, China", + "exact_answer":"6.4 (5.6 -7.7, 95% CI) days" + } + ] + }, + { + "nq_name":"What is the length of viral shedding after illness onset?", + "kq_name":"Length of viral shedding after illness onset", + "answers":[ + { + "id":"r5a46n9a", + "title":"Viral Kinetics and Antibody Responses in Patients with COVID-19", + "exact_answer":"12 days (range, 3-38 ) in nasopharyngeal swabs, 19 days (range, 5-37) in sputum and 18 days (range, 7-26) in stools" + }, + { + "id":"k36rymkv", + "title":"Clinical course and risk factors for mortality of adult inpatients with COVID-19 in Wuhan, China: a retrospective cohort study", + "exact_answer":"20·0 days (IQR 17·0-24·0)" + } + ] + }, + { + "nq_name":"What is the incubation period across different age groups?", + "kq_name":"Incubation period across different age groups", + "answers":[ + { + "id":"giabjjnz", + "title":"Children are unlikely to have been the primary source of household SARS-CoV-2 infections", + "exact_answer":"7.74 d ± 3.22" + }, + { + "id":"djq0lvr2", + "title":"Is a 14-day quarantine period optimal for effectively controlling coronavirus disease 2019 (COVID-19)?", + "exact_answer":"median incubation period of both male and female adults was similar (7-day) but significantly shorter than that (9-day) of child cases" + }, + { + "id":"awgyxn3t", + "title":"Clinical Characteristics of 34 Children with Coronavirus Disease-2019 in the West of China: a Multiple-center Case Series", + "exact_answer":"10.50 (7.75 -25.25) days" + } + ] + } + ] + }, + { 
+ "name":"Asymptomatic shedding", + "sub_categories":[ + { + "nq_name":"What is the proportion of patients who were asymptomatic?", + "kq_name":"Proportion of patients who were asymptomatic", + "answers":[ + { + "id":"bmsmegbs", + "title":"A considerable proportion of individuals with asymptomatic SARS-CoV-2 infection in Tibetan population", + "exact_answer":"21.7%" + }, + { + "id":"7w1bhaz6", + "title":"High incidence of asymptomatic SARS-CoV-2 infection, Chongqing, China", + "exact_answer":"19%" + }, + { + "id":"xsqgrd5l", + "title":"High transmissibility of COVID-19 near symptom onset", + "exact_answer":"there were 32 laboratory-confirmed COVID-19 patients, including five household/family clusters and four asymptomatic patients" + }, + { + "id":"56zhxd6e", + "title":"Epidemiological parameters of coronavirus disease 2019: a pooled analysis of publicly reported individual data of 1155 cases from seven countries", + "exact_answer":"49 (14.89%) were asymptomatic" + }, + { + "id":"atnz63pk", + "title":"Estimating the Asymptomatic Proportion of 2019 Novel Coronavirus onboard the Princess Cruises Ship, 2020", + "exact_answer":"17.9%" + }, + { + "id":"ofoqk100", + "title":"Clinical Characteristics of 24 Asymptomatic Infections with COVID-19 Screened among Close Contacts in Nanjing, China", + "exact_answer":"The remaining 7 (29.2%) cases showed normal CT image and had no symptoms during hospitalization." 
+ }, + { + "id":"k3f7ohzg", + "title":"Characteristics of COVID-19 infection in Beijing", + "exact_answer":"13 (5.0%) asymptomatic cases" + } + ] + }, + { + "nq_name":"What is the proportion of pediatric patients who were asymptomatic?", + "kq_name":"Proportion of pediatric patients who were asymptomatic", + "answers":[ + { + "id":"xsgxd5sy", + "title":"Epidemiological and Clinical Characteristics of Children with Coronavirus Disease 2019", + "exact_answer":"20 (27.03%) cases" + }, + { + "id":"dmrtsxik", + "title":"Articles Clinical and epidemiological features of 36 children with coronavirus disease 2019 (COVID-19) in Zhejiang, China: an observational cohort study", + "exact_answer":"ten (28%) patients" + }, + { + "id":"7w1bhaz6", + "title":"High incidence of asymptomatic SARS-CoV-2 infection, Chongqing, China", + "exact_answer":"28.6%" + }, + { + "id":"jvhrp51s", + "title":"Title: The clinical and epidemiological features and hints of 82 confirmed COVID-19 pediatric cases aged 0-16 in Wuhan, China", + "exact_answer":"8 no symptoms" + }, + { + "id":"j58f1lwa", + "title":"Preliminary epidemiological analysis on children and adolescents with novel coronavirus disease 2019 outside Hubei Province, China: an observational study utilizing crowdsourced data", + "exact_answer":"2 (8.0%) cases" + }, + { + "id":"mar8zt2t", + "title":"The different clinical characteristics of corona virus disease cases between children and their families in China -the character of children with COVID-19", + "exact_answer":"six (66.7%) children" + } + ] + }, + { + "nq_name":"What is the asymptomatic transmission during incubation?", + "kq_name":"Asymptomatic transmission during incubation", + "answers":[ + { + "id":"eflwztji", + "title":"Temporal dynamics in viral shedding and transmissibility of COVID-19", + "exact_answer":"44% of transmission prior to symptom onset" + }, + { + "id":"v3gww4iv", + "title":"Transmission of corona virus disease 2019 during the incubation period may lead to a 
quarantine loophole", + "exact_answer":"(73.0%) were infected before the symptom onset of the first-generation cases" + }, + { + "id":"56zhxd6e", + "title":"Epidemiological parameters of coronavirus disease 2019: a pooled analysis of publicly reported individual data of 1155 cases from seven countries Summary Background", + "exact_answer":"In 102 (43.78%) infector-infectee pairs, transmission occurred before infectors' symptom onsets" + }, + { + "id":"v3gww4iv", + "title":"Transmission of corona virus disease 2019 during the incubation period may lead to a quarantine loophole", + "exact_answer":"(73.0%) were infected before the symptom onset of the first-generation cases" + }, + { + "id":"st5vs6gq", + "title":"Title: The serial interval of COVID-19 from publicly reported confirmed cases Running Head: The serial interval of COVID-19", + "exact_answer":"12.6% of reports indicating pre-symptomatic transmission" + } + ] + } + ] + }, + { + "name":"Persistence of sources", + "sub_categories":[ + { + "nq_name":"What is the length of viral shedding in stool?", + "kq_name":"Length of viral shedding in stool", + "answers":[ + { + "id":"k86ljbxu", + "title":"Do children need a longer time to shed SARS- CoV-2 in stool than adults?", + "exact_answer":"100% positive on the third week after onset and 30% positive 29 days later" + }, + { + "id":"ouca1bol", + "title":"Evaluation of SARS-CoV-2 RNA shedding in clinical specimens and clinical characteristics of 10 patients with COVID-19 in Macau", + "exact_answer":"detected in feces till 14 days after the onset of symptoms" + }, + { + "id":"r5a46n9a", + "title":"Viral Kinetics and Antibody Responses in Patients with COVID-19", + "exact_answer":"18 days (range, 7-26)" + }, + { + "id":"1fyag5x3", + "title":"Virus shedding patterns in nasopharyngeal and fecal specimens of COVID-19 patients 2 3", + "exact_answer":"22.0 days (IQR 15.5 to 23.5)" + } + ] + }, + { + "nq_name":"What is the length of viral shedding from the nasopharynx?", + 
"kq_name":"Length of viral shedding from nasopharynx", + "answers":[ + { + "id":"1fyag5x3", + "title":"Virus shedding patterns in nasopharyngeal and fecal specimens of COVID-19 patients 2 3", + "exact_answer":"10.0 days (IQR 8.0 to 17.0)" + }, + { + "id":"r5a46n9a", + "title":"Viral Kinetics and Antibody Responses in Patients with COVID-19", + "exact_answer":"12 days (range, 3-38 )" + } + ] + }, + { + "nq_name":"What is the length of viral shedding in urine?", + "kq_name":"Length of viral shedding in urine", + "answers":[ + { + "id":"1fyag5x3", + "title":"Virus shedding patterns in nasopharyngeal and fecal specimens of COVID-19 patients 2 3", + "exact_answer":"all patient data were and urine samples were all negative, except for urine samples from two 53 severe cases at the latest available detection point (16 or 21 d.a.o)" + } + ] + }, + { + "nq_name":"What is the length of viral shedding in blood?", + "kq_name":"Length of viral shedding in blood", + "answers":[ + { + "id":"ac4aesoa", + "title":"Comparisons of nucleic acid conversion time of SARS-CoV-2 of different samples in ICU and non-ICU patients Conversion time of SARS-CoV-2 RT-PCR in ICU and non-ICU patients Letter to the Editor Comparisons of nucleic acid conversion time of SARS-CoV-2 of different samples in ICU and non-ICU patients", + "exact_answer":"10.17 ± 6.134 and 14.63 ± 5.878 days in non-ICU and ICU patients respectively" + } + ] + }, + { + "nq_name":"What is the prevalence of viral shedding in blood?", + "kq_name":"Prevalence of viral shedding in blood", + "answers":[ + { + "id":"r5a46n9a", + "title":"Viral Kinetics and Antibody Responses in Patients with COVID-19", + "exact_answer":"12 plasmas (5.7%) from 9 patients (14.3%) were positive" + } + ] + } + ] + }, + { + "name":"Diagnostics", + "sub_categories":[ + { + "nq_name":"What is the sensitivity and specificity of COVID-19 tests?", + "kq_name":"Sensitivity and specificity of COVID-19 tests", + "answers":[ + { + "id":"9p7hqk1u", + 
"title":"Journal Pre-proof COVID-19 pneumonia: a review of typical CT findings and differential diagnosis COVID-19 pneumonia: a review of typical CT findings and differential diagnosis", + "exact_answer":"A large series based on 1014 patients reported a 97% sensitivity of chest CT for the diagnosis of COVID-19, while the mean time interval between initial negative and positive RT-PCR was approximately 5 days" + }, + { + "id":"aa7slcnc", + "title":"Highly accurate and sensitive diagnostic detection of SARS-CoV-2 by digital PCR", + "exact_answer":"The overall sensitivity, specificity and diagnostic accuracy of RT-dPCR were 90%, 100% and 93 %, respectively" + }, + { + "id":"na8odvj7", + "title":"Serological detection of 2019-nCoV respond to the epidemic: A useful complement to nucleic acid testing", + "exact_answer":"The areas under the ROC curves of IgM and IgG were 0.988 and 1.000, respectively." + }, + { + "id":"py38bel4", + "title":"Clinical significance of IgM and IgG test for diagnosis of highly suspected COVID-19 infection", + "exact_answer":"The positive detection rate of combination of IgM and IgG for patients with COVID-19 negative and positive nucleic acid test was 72.73% and 87.50%." 
+ }, + { + "id":"5skk3nj4", + "title":"Imaging manifestations and diagnostic value of chest CT of coronavirus disease 2019 (COVID-19) in the Xiaogan area", + "exact_answer":"the overall accuracy rate of CT examination in the present study was 97.3%" + }, + { + "id":"cv3qgno3", + "title":"Rapid Molecular Detection of SARS-CoV-2 (COVID-19) Virus RNA Using Colorimetric LAMP", + "exact_answer":"colorimetric LAMP assay showed 100% agreement with the RT-qPCR results across a range of C q values" + }, + { + "id":"chln5r8w", + "title":"Diagnosis of the Coronavirus disease (COVID-19): rRT-PCR or CT?", + "exact_answer":"sensitivity was therefore 97.2%" + }, + { + "id":"s7uqawbd", + "title":"Rapid colorimetric detection of COVID-19 coronavirus using a reverse tran- scriptional loop-mediated isothermal amplification (RT-LAMP) diagnostic plat- form: iLACO", + "exact_answer":"iLACO is very sensitive, and as low as 10 copies of ORF1ab gene were detected successfully" + } + ] + } + ] + }, + { + "name":"Seasonality", + "sub_categories":[ + { + "nq_name":"How does temperature affect the transmission of COVID-19?", + "kq_name":"Effects of temperature on the transmission of COVID-19", + "answers":[ + { + "id":"r141na6j", + "title":"Eco-epidemiological assessment of the COVID-19 epidemic in China", + "exact_answer":"Adjusted incidence rate ratios suggested brighter, warmer and drier conditions were associated with lower incidence" + }, + { + "id":"mdyojac2", + "title":"Climate effect on COVID-19 spread rate: an online surveillance tool", + "exact_answer":"a plausible negative correlation between warmer climate and COVID-19 spread rate" + }, + { + "id":"vc2eheb6", + "title":"Causal empirical estimates suggest COVID-19 transmission rates are highly seasonal", + "exact_answer":"a 1 • C increase in local temperature reduces new COVID-19 cases per 1 million people by 13%, with a 95% confidence interval of [-21%, -4%]" + }, + { + "id":"fcaeoyxd", + "title":"Climate affects global patterns of 
COVID-19 early outbreak dynamics", + "exact_answer":"Temperature and humidity strongly impact the variation of the growth rate of Covid-19 cases across the globe" + }, + { + "id":"drzphrqj", + "title":"Preliminary evidence that higher temperatures are associated with lower incidence of COVID-19, for cases reported globally up to 29th February 2020", + "exact_answer":"higher average temperature was strongly associated with lower COVID-19 incidence for temperatures of 1°C and higher" + } + ] + }, + { + "nq_name":"How does humidity affect the transmission of COVID-19?", + "kq_name":"Effects of humidity on the transmission of COVID-19", + "answers":[ + { + "id":"6anr4xdw", + "title":"Effects of temperature variation and humidity on the death of COVID-19 in Wuhan, China", + "exact_answer":"both per 1 unit increase of temperature and absolute humidity were related to the decreased COVID-19 death counts" + }, + { + "id":"qz2joxys", + "title":"Role of temperature and humidity in the modulation of the doubling time of COVID-19 cases", + "exact_answer":"doubling time correlates positively with temperature and inversely with humidity, suggesting that a decrease in the rate of progression of COVID-19 with the arrival of spring and summer in the north hemisphere" + }, + { + "id":"fcaeoyxd", + "title":"Title: Climate Affects Global Patterns Of Covid-19 Early Outbreak Dynamics", + "exact_answer":"humidity strongly impact the variation of the growth rate of Covid-19 cases across the globe" + } + ] + }, + { + "nq_name":"How does seasonality affect the transmission of COVID-19?", + "kq_name":"Effects of seasonality on the transmission of COVID-19", + "answers":[ + { + "id":"f4hj35dr", + "title":"Projecting the transmission dynamics of SARS-CoV-2 through the post-pandemic period", + "exact_answer":"Winter/spring establishments favored longer-lasting outbreaks with shorter peaks (Fig 3A) , while autumn/winter establishments led to more acute outbreaks (Fig 3B) " + }, + { + 
"id":"zxx7tikz", + "title":"Susceptible supply limits the role of climate in the COVID-19 pandemic", + "exact_answer":"summer temperatures will not substantially limit pandemic growth" + }, + { + "id":"f3qeoyvf", + "title":"Title: Modeling the Corona Virus Outbreak in IRAN", + "exact_answer":"available data is too small to identify seasonal patterns and make predictable variation in value" + }, + { + "id":"okvu49y3", + "title":"Application of the ARIMA model on the COVID- 2019 epidemic dataset", + "exact_answer":"prevalence and incidence of COVID-2019 are not influenced by the seasonality" + }, + { + "id":"3p2dl8yf", + "title":"Potential impact of seasonal forcing on a SARS-CoV-2 pandemic", + "exact_answer":"seasonal variation might slow down a pandemic" + }, + { + "id":"x4qdiln9", + "title":"A spatial model of CoVID-19 transmission in England and Wales: early spread and peak timing", + "exact_answer":"Seasonal changes in transmission rate could shift the timing of the peak into winter months" + }, + { + "id":"f4hj35dr", + "title":"Projecting the transmission dynamics of SARS-CoV-2 through the post-pandemic period", + "exact_answer":"HCoV-OC43 and HCoV-HKU1 cause annual wintertime outbreaks of respiratory illness in temperate regions (9, 10), suggesting that wintertime climate and host behaviors may facilitate transmission, as is true for influenza" + }, + { + "id":"nzat41wu", + "title":"Social distancing strategies for curbing the COVID-19 epidemic", + "exact_answer":"If SARS-Cov-2 transmission is similarly subject to seasonal forcing, summer outbreaks would naturally have lower peaks than winter outbreaks" + } + ] + } + ] + }, + { + "name":"Physical science", + "sub_categories":[ + { + "nq_name":"What purity of ethanol inactivates COVID-19?", + "kq_name":"Purity of ethanol to inactivate COVID-19", + "answers":[ + { + "id":"c1n994j6", + "title":"Efficient inactivation of SARS-CoV-2 by WHO-recommended hand rub formulations and alcohols", + "exact_answer":"WHO 21 
formulation I, based on 85 % ethanol, efficiently inactivated the virus with reduction 22 factors" + }, + { + "id":"c1n994j6", + "title":"Efficient inactivation of SARS-CoV-2 by WHO-recommended hand rub formulations and alcohols", + "exact_answer":"Both alcohols, ethanol (Fig. 9 2A) and 2-propanol ( Fig. 2B) were able to reduce viral titers in 30 s exposure" + }, + { + "id":"c1n994j6", + "title":"Efficient inactivation of SARS-CoV-2 by WHO-recommended hand rub formulations and alcohols", + "exact_answer":"minimal concentration of 30 % ethanol or 2-propanol is sufficient for viral 12 inactivation" + } + ] + }, + { + "nq_name":"What is the temperature used for inactivating COVID-19?", + "kq_name":"Temperature used for inactivating COVID-19", + "answers":[ + { + "id":"dm1wkpnv", + "title":"Can N95 respirators be reused after disinfection? And for how many times?", + "exact_answer":"heating (dry or in the presence humidity) <100 °C can preserve the filtration characteristics of a pristine N95 respirator" + }, + { + "id":"d48u5w0h", + "title":"Rapid evidence summary on SARS-CoV-2 survivorship and disinfection, and a reusable PPE protocol using a double-hit process", + "exact_answer":"60°C for 90 minutes" + } + ] + }, + { + "nq_name":"What is the UVGI intensity used for inactivating COVID-19?", + "kq_name":"UVGI intensity used for inactivating COVID-19", + "answers":[ + { + "id":"dm1wkpnv", + "title":"Can N95 respirators be reused after disinfection? 
And for how many times?", + "exact_answer":"(254 nm, 17 mW/cm 2 )" + }, + { + "id":"d48u5w0h", + "title":"Rapid evidence summary on SARS-CoV-2 survivorship and disinfection, and a reusable PPE protocol using a double-hit process", + "exact_answer":"at least 1,000 mJ/cm 2" + } + ] + } + ] + }, + { + "name":"Metaresearch", + "sub_categories":[ + { + "kq_name":"Sample size used in COVID-19 studies", + "nq_name":"How large is the sample size used in COVID-19 studies?", + "answers":[ + { + "id":"wuclekt6", + "title":"Longitudinal analysis of laboratory findings during the process of recovery for patients with COVID-19", + "exact_answer":"59 cases" + }, + { + "id":"ragcpbl6", + "title":"Evolving epidemiology of novel coronavirus diseases 2019 and possible interruption of local transmission outside Hubei Province in China: a descriptive and modeling study", + "exact_answer":"49 cases" + }, + { + "id":"56zhxd6e", + "title":"Epidemiological parameters of coronavirus disease 2019: a pooled analysis of publicly reported individual data of 1155 cases from seven countries Summary Background", + "exact_answer":"587 cases" + }, + { + "id":"djq0lvr2", + "title":"Is a 14-day quarantine period optimal for effectively controlling coronavirus disease 2019 (COVID-19)?", + "exact_answer":"2015 cases" + }, + { + "id":"0hnh4n9e", + "title":"Investigation of three clusters of COVID-19 in Singapore: implications for surveillance and response measures", + "exact_answer":"36 individuals" + }, + { + "id":"it4ka7v0", + "title":"Estimation of incubation period distribution of COVID-19 using disease onset forward time: a novel cross-sectional and forward follow-up study", + "exact_answer":"sample size of 1211" + }, + { + "id":"tne83uu0", + "title":"Epidemiologic Characteristics of COVID-19 in Guizhou, China Affiliations: Guizhou Center for Disease Control and Prevention Abstract", + "exact_answer":"162 cases" + }, + { + "id":"glq0lckz", + "title":"Clinical Characteristics of SARS-CoV-2 Pneumonia 
Compared to Controls in Chinese Han Population", + "exact_answer":"69 hospitalized patients" + }, + { + "id":"ti9b1etu", + "title":"Transmission and clinical characteristics of coronavirus disease 2019 in 104 outside-Wuhan patients, China", + "exact_answer":"104 cases" + }, + { + "id":"1mxjklgx", + "title":"Epidemiological characteristics of 1212 COVID-19 patients in Henan, China. medRxiv", + "exact_answer":"1212 patients" + }, + { + "id":"u8goc7io", + "title":"Title: The incubation period of 2019-nCoV infections among travellers from Wuhan, China", + "exact_answer":"88 confirmed cases" + }, + { + "id":"r5a46n9a", + "title":"Viral Kinetics and Antibody Responses in Patients with COVID-19", + "exact_answer":"67 patients" + }, + { + "id":"giabjjnz", + "title":"Children are unlikely to have been the primary source of household SARS-CoV-2 infections", + "exact_answer":"40 articles" + }, + { + "id":"bmsmegbs", + "title":"A considerable proportion of individuals with asymptomatic SARS- CoV-2 infection in Tibetan population", + "exact_answer":"83 cases" + }, + { + "id":"56zhxd6e", + "title":"Epidemiological parameters of coronavirus disease 2019: a pooled analysis of publicly reported individual data of 1155 cases from seven countries Summary Background", + "exact_answer":"329 cases" + }, + { + "id":"atnz63pk", + "title":"Estimating the Asymptomatic Proportion of 2019 Novel Coronavirus onboard the Princess Cruises Ship, 2020", + "exact_answer":"634 cases," + }, + { + "id":"k3f7ohzg", + "title":"Characteristics of COVID-19 infection in Beijing", + "exact_answer":"262 patients" + }, + { + "id":"dmrtsxik", + "title":"Articles Clinical and epidemiological features of 36 children with coronavirus disease 2019 (COVID-19) in Zhejiang, China: an observational cohort study", + "exact_answer":"36 children" + }, + { + "id":"jvhrp51s", + "title":"Title: The clinical and epidemiological features and hints of 82 confirmed COVID-19 pediatric cases aged 0-16 in Wuhan, China", + 
"exact_answer":"82 confirmed cases" + }, + { + "id":"ouca1bol", + "title":"Evaluation of SARS-CoV-2 RNA shedding in clinical specimens and clinical characteristics of 10 patients with COVID-19 in Macau", + "exact_answer":"10 cases" + }, + { + "id":"ac4aesoa", + "title":"Comparisons of nucleic acid conversion time of SARS-CoV-2 of different samples in ICU and non-ICU patients Conversion time of SARS-CoV-2 RT-PCR in ICU and non-ICU patients Letter to the Editor Comparisons of nucleic acid conversion time of SARS-CoV-2 of different samples in ICU and non-ICU patients", + "exact_answer":"thirty-two patients" + }, + { + "id":"4aps0kvp", + "title":"Molecular and serological investigation of 2019-nCoV infected patients: implication of multiple shedding routes", + "exact_answer":"16 people" + }, + { + "id":"3q2bj7cr", + "title":"Prolonged viral shedding in feces of pediatric patients with coronavirus disease 2019", + "exact_answer":"three paediatric cases" + }, + { + "id":"nrdiqees", + "title":"History of coronary heart disease increases the mortality rate of COVID-19 patients: a nested case-control study", + "exact_answer":"275 publicly reported confirmed cases" + }, + { + "id":"as6tbfrh", + "title":"Title: Clinical characteristics associated with COVID-19 severity in California", + "exact_answer":"54 patients" + }, + { + "id":"5ciaonf0", + "title":"Title: Epidemiological and clinical features of 291 cases with coronavirus disease 2019 in areas adjacent to Hubei, China: a double-center observational study", + "exact_answer":"291 patients" + }, + { + "id":"niw61l9r", + "title":"Neurological Manifestations of Hospitalized Patients with COVID-19 in Wuhan, China: a retrospective case series study", + "exact_answer":"214 patients" + }, + { + "id":"pd70i3d8", + "title":"Neutrophil-to-Lymphocyte Ratio Predicts Severe Illness Patients with 2019 Novel Coronavirus in the Early Stage", + "exact_answer":"61 patients" + } + ] + } + ] + }, + { + "name":"Hypertension", + 
"sub_categories":[ + { + "nq_name":"What is the RR for severe infection in COVID-19 patients with hypertension?", + "kq_name":"RR for severe infection in COVID-19 patients with hypertension", + "answers":[ + { + "id":"4mnmaky6", + "title":"The relationship of COVID-19 severity with cardiovascular disease and its traditional risk factors: A systematic review and meta-analysis", + "exact_answer":"hypertension (10 studies; 2.74 [2.12-3.54])" + }, + { + "id":"yx8b2moc", + "title":"Incidence, clinical characteristics and prognostic factor of patients with COVID-19: a systematic review and meta-analysis Running title: Predictors of clinical prognosis of COVID-19", + "exact_answer":"hypertension (RR = 4.48; 95% CI" + } + ] + }, + { + "nq_name":"What is the HR for severe infection in COVID-19 patients with hypertension?", + "kq_name":"HR for severe infection in COVID-19 patients with hypertension", + "answers":[ + { + "id":"7s9ot4vq", + "title":"Comorbidity and its impact on 1,590 patients with COVID-19 in China: A Nationwide Analysis", + "exact_answer":"hypertension (HR 1.58, 95%CI 1.07-2.32)" + } + ] + }, + { + "nq_name":"What is the OR for severe infection in COVID-19 patients with hypertension?", + "kq_name":"OR for severe infection in COVID-19 patients with hypertension", + "answers":[ + { + "id":"v6frcc5r", + "title":"Effects of hypertension, diabetes and coronary heart disease on COVID-19 diseases severity: a systematic review and meta-analysis", + "exact_answer":"There were significant correlations between COVID-19 severity and hypertension [OR=2.3 [95% CI (1.76, 3.00), P<0.01]" + }, + { + "id":"vof63qat", + "title":"Systematic review and meta-analysis of predictive symptoms and comorbidities for severe COVID-19 infection", + "exact_answer":"hypertension (1.97, 95% CI 1.40 -2.77)" + }, + { + "id":"o56j4qio", + "title":"Prevalence of comorbidities in the novel Wuhan coronavirus (COVID-19) infection: a systematic review and meta-analysis Prevalence of comorbidities 
in the Novel Wuhan Coronavirus (COVID-19) infection: a systematic review and meta-analysis", + "exact_answer":"the pooled odds ratio of hypertension, respiratory system disease, cardiovascular disease in severe patients were (OR 2.36, 95% CI: 1.46-3.83) ,(OR 2.46, 95% CI: 1.76-3.44) and (OR 3.42, 95% CI: 1.88-6.22)respectively" + } + ] + }, + { + "nq_name":"What is the mortality rate for COVID-19 patients with hypertension?", + "kq_name":"Mortality rate for COVID-19 patients with hypertension", + "answers":[ + { + "id":"r0r1kvkp", + "title":"Development and external validation of a prognostic multivariable model on admission for hospitalized patients with COVID-19", + "exact_answer":"Patients with hypertension, showed a significantly higher mortality rate (p<0·001)." + } + ] + } + ] + }, + { + "name":"Diabetes", + "sub_categories":[ + { + "nq_name":"What is the OR for severe infection in COVID-19 patients with diabetes?", + "kq_name":"OR for severe infection in COVID-19 patients with diabetes", + "answers":[ + { + "id":"oqo8pa1p", + "title":"Risk factors for severe corona virus disease 2019 (COVID-19) patients : a systematic review and meta analysis", + "exact_answer":"OR = 3.04 [2.01, 4.60]" + }, + { + "id":"v6frcc5r", + "title":"Effects of hypertension, diabetes and coronary heart disease on COVID-19 diseases severity: a systematic review and meta-analysis", + "exact_answer":"[OR=2.67, 95% CI (1.91, 3.74), P<0.01]" + }, + { + "id":"yx8b2moc", + "title":"Incidence, clinical characteristics and prognostic factor of patients with COVID-19: a systematic review and meta-analysis Running title: Predictors of clinical prognosis of COVID-19", + "exact_answer":"2.49; 95% CI, ; n = 10; I 2 = 44%" + }, + { + "id":"o56j4qio", + "title":"Journal Pre-proof Prevalence of comorbidities in the novel Wuhan coronavirus (COVID-19) infection: a systematic review and meta-analysis Prevalence of comorbidities in the Novel Wuhan Coronavirus (COVID-19) infection: a systematic review and 
meta-analysis", + "exact_answer":"OR 2.07, 95% CI: 0.89-4.82" + }, + { + "id":"o56j4qio", + "title":"Journal Pre-proof Prevalence of comorbidities in the novel Wuhan coronavirus (COVID-19) infection: a systematic review and meta-analysis Prevalence of comorbidities in the Novel Wuhan Coronavirus (COVID-19) infection: a systematic review and meta-analysis", + "exact_answer":"OR 2.07, 95% CI: 0.89-4.82" + } + ] + }, + { + "nq_name":"What is the RR for severe infection in COVID-19 patients with diabetes?", + "kq_name":"RR for severe infection in COVID-19 patients with diabetes", + "answers":[ + { + "id":"4mnmaky6", + "title":"The relationship of COVID-19 severity with cardiovascular disease and its traditional risk factors: A systematic review and meta-analysis", + "exact_answer":"relative risk estimate of 2.81 (2.01-3.93)" + } + ] + }, + { + "nq_name":"What is the HR for death in COVID-19 patients with diabetes?", + "kq_name":"HR for death in COVID-19 patients with diabetes", + "answers":[ + { + "id":"nrdiqees", + "title":"History of coronary heart disease increases the mortality rate of COVID-19 patients: a nested case-control study", + "exact_answer":"HR=1.1, p=0.61" + } + ] + }, + { + "nq_name":"What is the AHR for death in COVID-19 patients with diabetes?", + "kq_name":"AHR for death in COVID-19 patients with diabetes", + "answers":[ + { + "id":"skknfc6h", + "title":"Comorbid Diabetes Mellitus was Associated with Poorer Prognosis in Patients with COVID-19: A Retrospective Cohort Study", + "exact_answer":"[aHR]=3.64; 95% confidence interval [CI]: 1.09, 12.21" + } + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/data/kaggle-lit-review.json b/data/kaggle-lit-review.json deleted file mode 100644 index a1fafc5c..00000000 --- a/data/kaggle-lit-review.json +++ /dev/null @@ -1,459 +0,0 @@ -{ - "categories":[ - { - "name":"Incubation period", - "sub_categories":[ - { - "name":"What is the incubation period of the virus?", - "answers":[ - { - 
"id":"wuclekt6", - "title":"Longitudinal analysis of laboratory findings during the process of recovery for patients with COVID-19", - "exact_answer":"4 days (IQR, 2-7)" - }, - { - "id":"e3t1f0rt", - "title":"Epidemiological Characteristics of COVID-19; a Systemic Review and Meta-Analysis 1", - "exact_answer":"5.84 (99% CI: 4.83, 6.85) days" - }, - { - "id":"ragcpbl6", - "title":"Evolving epidemiology of novel coronavirus diseases 2019 and possible interruption of local transmission outside Hubei Province in China: a descriptive and modeling study", - "exact_answer":"5·2 days" - }, - { - "id":"n0uwy77g", - "title":"Clinical characteristics and durations of hospitalized patients with COVID-19 in Beijing: a retrospective cohort study", - "exact_answer":"4 (3-7) days" - }, - { - "id":"", - "title":"Early Transmission Dynamics in Wuhan, China, of Novel Coronavirus–Infected Pneumonia", - "exact_answer":"5.2 days (95% confidence interval [CI], 4.1 to 7.0)" - }, - { - "id":"x23ej29m", - "title":"Clinical features and obstetric and neonatal outcomes of pregnant patients with COVID-19 in Wuhan, China: a retrospective, single-centre, descriptive study", - "exact_answer":"5 days (range 2-9 days)" - }, - { - "id":"56zhxd6e", - "title":"Epidemiological parameters of coronavirus disease 2019: a pooled analysis of publicly reported individual data of 1155 cases from seven countries Summary Background", - "exact_answer":"7.44 days" - }, - { - "id":"zph6r4il", - "title":"Epidemiological, clinical and virological characteristics of 74 cases of coronavirus-infected disease 2019 (COVID-19) with gastrointestinal symptoms", - "exact_answer":"4 days (IQR 3-7 days)" - }, - { - "id":"djq0lvr2", - "title":"Is a 14-day quarantine period optimal for effectively controlling coronavirus disease 2019 (COVID-19)?", - "exact_answer":"The median incubation period of both male and female adults was similar (7-day) but significantly shorter than that (9-day) of child cases" - }, - { - 
"id":"n0vmb946", - "title":"The difference in the incubation period of 2019 novel coronavirus (SARS-CoV-2) infection between travelers to Hubei and non-travelers: The need of a longer quarantine period", - "exact_answer":"1.8 and 7.2 days" - }, - { - "id":"awgyxn3t", - "title":"Clinical Characteristics of 34 Children with Coronavirus Disease-2019 in the West of China: a Multiple-center Case Series", - "exact_answer":"10.50 (7.75 -25.25) days" - }, - { - "id":"0hnh4n9e", - "title":"Investigation of three clusters of COVID-19 in Singapore: implications for surveillance and response measures", - "exact_answer":"4 days (IQR 3-6)" - }, - { - "id":"it4ka7v0", - "title":"Estimation of incubation period distribution of COVID-19 using disease onset forward time: a novel cross-sectional and forward follow-up study", - "exact_answer":"median of incubation period is 8·13 days (95% confidence interval [CI]: 7·37-8·91), the mean is 8·62 days (95% CI: 8·02-9·28)" - }, - { - "id":"glq0lckz", - "title":"Clinical Characteristics of SARS-CoV-2 Pneumonia Compared to Controls in Chinese Han Population", - "exact_answer":"4 days (IQR, 2 to 7)" - }, - { - "id":"8anqfkmo", - "title":"The Incubation Period of Coronavirus Disease 2019 (COVID-19) From Publicly Reported Confirmed Cases: Estimation and Application", - "exact_answer":"5.1 days (CI, 4.5 to 5.8 days)" - }, - { - "id":"v3gww4iv", - "title":"Transmission of corona virus disease 2019 during the incubation period may lead to a quarantine loophole", - "exact_answer":"4.9 days (95% confidence interval [CI], 4.4 to 5.4) days" - }, - { - "id":"66ulqu11", - "title":"Transmission interval estimates suggest pre-symptomatic spread of COVID-19", - "exact_answer":"7.1 (6.13, 8.25) days for Singapore and 9 (7.92, 10.2) days for Tianjin" - }, - { - "id":"ti9b1etu", - "title":"Transmission and clinical characteristics of coronavirus disease 2019 in 104 outside-Wuhan patients, China", - "exact_answer":"6 days, ranged from 1 to 32 days" - }, - { - 
"id":"k3f7ohzg", - "title":"Characteristics of COVID-19 infection in Beijing", - "exact_answer":"6.7 days" - }, - { - "id":"jxtch47t", - "title":"Epidemiologic and Clinical Characteristics of 91 Hospitalized Patients with COVID-19 in Zhejiang, China: A retrospective, multi-centre case series", - "exact_answer":"6 (IQR, 3-8) days" - }, - { - "id":"dbzrd23n", - "title":"Title: A descriptive study of the impact of diseases control and prevention on the epidemics 1 dynamics and clinical features of SARS-CoV-2 outbreak in Shanghai, lessons learned for", - "exact_answer":"6.4 days (95% 175 CI 5.3 to 7.6)" - }, - { - "id":"j3avpu1y", - "title":"A familial cluster of pneumonia associated with the 2019 novel coronavirus indicating person-to-person transmission: a study of a family cluster", - "exact_answer":"3-6 days" - }, - { - "id":"1mxjklgx", - "title":"Epidemiological characteristics of 1212 COVID-19 patients in Henan, China. medRxiv", - "exact_answer":"average, mode and median incubation periods are 7.4, 4 and 7 days" - }, - { - "id":"ykofrn9i", - "title":"Incubation Period and Other Epidemiological Characteristics of 2019 Novel Coronavirus Infections with Right Truncation: A Statistical Analysis of Publicly Available Case Data", - "exact_answer":"5.6 days (95% CI: 4.4, 7.4)" - }, - { - "id":"u8goc7io", - "title":"Title: The incubation period of 2019-nCoV infections among travellers from Wuhan, China", - "exact_answer":"6.4 (5.6 -7.7, 95% CI) days" - } - ] - }, - { - "name":"Length of viral shedding after illness onset", - "answers":[ - { - "id":"bg0cw5s6", - "title":"Factors associated with prolonged viral shedding and impact of Lopinavir/Ritonavir treatment in patients with SARS-CoV-2 infection", - "exact_answer":"23 days (IQR, 18-32 days)" - }, - { - "id":"r5a46n9a", - "title":"Viral Kinetics and Antibody Responses in Patients with COVID-19", - "exact_answer":"12 (3-38), 19 (5-37), and 18 (7-26) days in nasopharyngeal swabs, sputum and stools, respectively" - }, - { 
- "id":"k36rymkv", - "title":"Clinical course and risk factors for mortality of adult inpatients with COVID-19 in Wuhan, China: a retrospective cohort study", - "exact_answer":"20·0 days (IQR 17·0–24·0)" - } - ] - }, - { - "name":"Incubation period across different age groups", - "answers":[ - { - "id":"giabjjnz", - "title":"Children are unlikely to have been the primary source of household SARS-CoV-2 infections", - "exact_answer":"7.74 d ± 3.22" - }, - { - "id":"djq0lvr2", - "title":"Is a 14-day quarantine period optimal for effectively controlling coronavirus disease 2019 (COVID-19)?", - "exact_answer":"median incubation period of both male and female adults was similar (7-day) but significantly shorter than that (9-day) of child cases" - }, - { - "id":"awgyxn3t", - "title":"Clinical Characteristics of 34 Children with Coronavirus Disease-2019 in the West of China: a Multiple-center Case Series", - "exact_answer":"10.50 (7.75 -25.25) days" - } - ] - } - ] - }, - { - "name":"Asymptomatic shedding", - "sub_categories":[ - { - "name":"Proportion of patients who were asymptomatic", - "answers":[ - { - "id":"bmsmegbs", - "title":"A considerable proportion of individuals with asymptomatic SARS-CoV-2 infection in Tibetan population", - "exact_answer":"21.7%" - }, - { - "id":"jjgfgqwg", - "title":"Modes of contact and risk of transmission in COVID-19 among close contacts", - "exact_answer":"6.2%" - }, - { - "id":"7w1bhaz6", - "title":"High incidence of asymptomatic SARS-CoV-2 infection, Chongqing, China", - "exact_answer":"19%" - }, - { - "id":"xsqgrd5l", - "title":"High transmissibility of COVID-19 near symptom onset", - "exact_answer":"there were 32 laboratory-confirmed COVID-19 patients, including five household/family clusters and four asymptomatic patients" - }, - { - "id":"6su2x8mk", - "title":"Non-severe vs severe symptomatic COVID-19: 104 cases from the outbreak on the cruise ship “Diamond Princess” in Japan", - "exact_answer":"76 and 28 patients were classified 
as non-severe (asymptomatic, mild)" - }, - { - "id":"rjm1dqk7", - "title":"Epidemiological characteristics of 2019 novel coronavirus family clustering in Zhejiang Province", - "exact_answer":"54 asymptomatic infected cases" - }, - { - "id":"56zhxd6e", - "title":"Epidemiological parameters of coronavirus disease 2019: a pooled analysis of publicly reported individual data of 1155 cases from seven countries", - "exact_answer":"49 (14.89%) were asymptomatic" - }, - { - "id":"atnz63pk", - "title":"Estimating the Asymptomatic Proportion of 2019 Novel Coronavirus onboard the Princess Cruises Ship, 2020", - "exact_answer":"17.9%" - }, - { - "id":"ofoqk100", - "title":"Clinical Characteristics of 24 Asymptomatic Infections with COVID-19 Screened among Close Contacts in Nanjing, China", - "exact_answer":"The remaining 7 (29.2%) cases showed normal CT image and had no symptoms during hospitalization." - }, - { - "id":"k3f7ohzg", - "title":"Characteristics of COVID-19 infection in Beijing", - "exact_answer":"13 (5.0%) asymptomatic cases" - }, - { - "id":"f3h74j1n", - "title":"Estimation of the asymptomatic ratio of novel coronavirus infections (COVID-19)", - "exact_answer":"the asymptomatic ratio at 41.6%" - } - ] - }, - { - "name":"Proportion of pediatric patients who were asymptomatic", - "answers":[ - { - "id":"xsgxd5sy", - "title":"Epidemiological and Clinical Characteristics of Children with Coronavirus Disease 2019", - "exact_answer":"20 (27.03%) cases" - }, - { - "id":"dmrtsxik", - "title":"Articles Clinical and epidemiological features of 36 children with coronavirus disease 2019 (COVID-19) in Zhejiang, China: an observational cohort study", - "exact_answer":"ten (28%) patients" - }, - { - "id":"7w1bhaz6", - "title":"High incidence of asymptomatic SARS-CoV-2 infection, Chongqing, China", - "exact_answer":"(28.6%) in children group under 14, next in elder group over 70 (27.3%)" - }, - { - "id":"jvhrp51s", - "title":"Title: The clinical and epidemiological features and 
hints of 82 confirmed COVID-19 pediatric cases aged 0-16 in Wuhan, China", - "exact_answer":"8 (9.76%)" - }, - { - "id":"j58f1lwa", - "title":"Preliminary epidemiological analysis on children and adolescents with novel coronavirus disease 2019 outside Hubei Province, China: an observational study utilizing crowdsourced data", - "exact_answer":"2 (8.0%) cases" - }, - { - "id":"mar8zt2t", - "title":"The different clinical characteristics of corona virus disease cases between children and their families in China -the character of children with COVID-19", - "exact_answer":"six (66.7%) children" - } - ] - }, - { - "name":"Asymptomatic transmission during incubation", - "answers":[ - { - "id":"eflwztji", - "title":"Temporal dynamics in viral shedding and transmissibility of COVID-19", - "exact_answer":"44% of transmission prior to symptom onset" - }, - { - "id":"v3gww4iv", - "title":"Transmission of corona virus disease 2019 during the incubation period may lead to a quarantine loophole", - "exact_answer":"(73.0%) were infected before the symptom onset of the first-generation cases" - }, - { - "id":"56zhxd6e", - "title":"Epidemiological parameters of coronavirus disease 2019: a pooled analysis of publicly reported individual data of 1155 cases from seven countries Summary Background", - "exact_answer":"In 102 (43.78%) infector-infectee pairs, transmission occurred before infectors' symptom onsets" - }, - { - "id":"v3gww4iv", - "title":"Transmission of corona virus disease 2019 during the incubation period may lead to a quarantine loophole", - "exact_answer":"(73.0%) were infected before the symptom onset of the first-generation cases" - }, - { - "id":"st5vs6gq", - "title":"Title: The serial interval of COVID-19 from publicly reported confirmed cases Running Head: The serial interval of COVID-19", - "exact_answer":"12.6% of reports indicating pre-symptomatic transmission" - } - ] - } - ] - }, - { - "name":"Persistence of sources", - "sub_categories":[ - { - "name":"Length 
of viral shedding in stool", - "answers":[ - { - "id":"k86ljbxu", - "title":"Do children need a longer time to shed SARS- CoV-2 in stool than adults?", - "exact_answer":"100% positive on the third week after onset and 30% positive 29 days later" - }, - { - "id":"ouca1bol", - "title":"Evaluation of SARS-CoV-2 RNA shedding in clinical specimens and clinical characteristics of 10 patients with COVID-19 in Macau", - "exact_answer":"detected in feces till 14 days after the onset of symptoms" - }, - { - "id":"r5a46n9a", - "title":"Viral Kinetics and Antibody Responses in Patients with COVID-19", - "exact_answer":"18 days (range, 7-26)" - }, - { - "id":"1fyag5x3", - "title":"Virus shedding patterns in nasopharyngeal and fecal specimens of COVID-19 patients 2 3", - "exact_answer":"22.0 days (IQR 15.5 to 23.5)" - } - ] - }, - { - "name":"Length of viral shedding from nasopharynx", - "answers":[ - { - "id":"1fyag5x3", - "title":"Virus shedding patterns in nasopharyngeal and fecal specimens of COVID-19 patients 2 3", - "exact_answer":"10.0 days (IQR 8.0 to 17.0)" - }, - { - "id":"r5a46n9a", - "title":"Viral Kinetics and Antibody Responses in Patients with COVID-19", - "exact_answer":"12 days (range, 3-38 )" - } - ] - }, - { - "name":"Length of viral shedding in urine", - "answers":[ - { - "id":"1fyag5x3", - "title":"Virus shedding patterns in nasopharyngeal and fecal specimens of COVID-19 patients 2 3", - "exact_answer":"all patient data were and urine samples were all negative, except for urine samples from two 53 severe cases at the latest available detection point (16 or 21 d.a.o)" - } - ] - }, - { - "name":"Length of viral shedding in blood", - "answers":[ - { - "id":"ac4aesoa", - "title":"Comparisons of nucleic acid conversion time of SARS-CoV-2 of different samples in ICU and non-ICU patients Conversion time of SARS-CoV-2 RT-PCR in ICU and non-ICU patients Letter to the Editor Comparisons of nucleic acid conversion time of SARS-CoV-2 of different samples in ICU and 
non-ICU patients", - "exact_answer":"10.17 ± 6.134 and 14.63 ± 5.878 days in non-ICU and ICU patients respectively" - } - ] - }, - { - "name":"Prevalence of viral shedding in blood", - "answers":[ - { - "id":"r5a46n9a", - "title":"Viral Kinetics and Antibody Responses in Patients with COVID-19", - "exact_answer":"12 plasmas (5.7%) from 9 patients (14.3%) were positive" - } - ] - } - ] - }, - { - "name":"Diagnostics", - "sub_categories":[ - { - "name":"Sensitivity and specificity of COVID-19 tests", - "answers":[ - { - "id":"9p7hqk1u", - "title":"Journal Pre-proof COVID-19 pneumonia: a review of typical CT findings and differential diagnosis COVID-19 pneumonia: a review of typical CT findings and differential diagnosis", - "exact_answer":"A large series based on 1014 patients reported a 97% sensitivity of chest CT for the diagnosis of COVID-19, while the mean time interval between initial negative and positive RT-PCR was approximately 5 days" - }, - { - "id":"aa7slcnc", - "title":"Highly accurate and sensitive diagnostic detection of SARS-CoV-2 by digital PCR", - "exact_answer":"The overall sensitivity, specificity and diagnostic accuracy of RT-dPCR were 90%, 100% and 93 %, respectively" - }, - { - "id":"na8odvj7", - "title":"Serological detection of 2019-nCoV respond to the epidemic: A useful complement to nucleic acid testing", - "exact_answer":"The areas under the ROC curves of IgM and IgG were 0.988 and 1.000, respectively." 
- }, - { - "id":"", - "title":"Molecular immune pathogenesis and diagnosis of COVID-19", - "exact_answer":"RT-qPCR can only achieve a sensitivity of 50% to 79%, depending on the protocol used the sample type and number of clinical specimens collected" - }, - { - "id":"", - "title":"Molecular immune pathogenesis and diagnosis of COVID-19", - "exact_answer":"The sensitivity of SARS-CoV N-based IgG ELISA (94.7%) is significantly higher than that of SARS-CoV S-based IgG ELISA (58.9%)" - }, - { - "id":"py38bel4", - "title":"Clinical significance of IgM and IgG test for diagnosis of highly suspected COVID-19 infection", - "exact_answer":"The positive detection rate of combination of IgM and IgG for patients with COVID-19 negative and positive nucleic acid test was 72.73% and 87.50%." - }, - { - "id":"5skk3nj4", - "title":"Imaging manifestations and diagnostic value of chest CT of coronavirus disease 2019 (COVID-19) in the Xiaogan area", - "exact_answer":"the overall accuracy rate of CT examination in the present study was 97.3%" - }, - { - "id":"cv3qgno3", - "title":"Rapid Molecular Detection of SARS-CoV-2 (COVID-19) Virus RNA Using Colorimetric LAMP", - "exact_answer":"colorimetric LAMP assay showed 100% agreement with the RT-qPCR results across a range of C q values" - }, - { - "id":"", - "title":"Differential diagnosis of illness in patients under investigation for the novel coronavirus (SARS-CoV-2), Italy, February 2020", - "exact_answer":"Broad screening for respiratory pathogens revealed a high rate of influenza virus infections, accounting for 28.5% of all suspected cases of SARS-CoV-2 infection" - }, - { - "id":"8gncbgot", - "title":"Potential Rapid Diagnostics, Vaccine and Therapeutics for 2019 Novel Coronavirus (2019-nCoV): A Systematic Review", - "exact_answer":"E gene and RdRp gene assays produced the best result (5.2 and 3.8 copies per reaction at 95% detection probability, respectively)" - }, - { - "id":"chln5r8w", - "title":"Diagnosis of the Coronavirus 
disease (COVID-19): rRT-PCR or CT?", - "exact_answer":"Sensitivity of CT examinations was 97.2% at presentation, whereas first round rRT-PCR sensitivity was 84.6%" - }, - { - "id":"s7uqawbd", - "title":"Rapid colorimetric detection of COVID-19 coronavirus using a reverse tran- scriptional loop-mediated isothermal amplification (RT-LAMP) diagnostic plat- form: iLACO", - "exact_answer":"iLACO is very sensitive, and as low as 10 copies of ORF1ab gene were detected successfully" - }, - { - "id":"", - "title":"Detection of 2019 novel coronavirus (2019-nCoV) by real-time RT-PCR", - "exact_answer":"All assays were highly sensitive, with best results obtained for the E gene and RdRp gene assays (5.2 and 3.8 copies per reaction at 95% detection probability, respectively)" - } - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/pygaggle/data/kaggle.py b/pygaggle/data/kaggle.py index 46fbd17d..cbadd1fb 100644 --- a/pygaggle/data/kaggle.py +++ b/pygaggle/data/kaggle.py @@ -1,13 +1,15 @@ -from collections import OrderedDict +from collections import OrderedDict, defaultdict from typing import List import json import logging from pydantic import BaseModel +import scipy.special as sp +import numpy as np -from .relevance import RelevanceExample, LuceneDocumentLoader +from .relevance import RelevanceExample, Cord19DocumentLoader from pygaggle.model.tokenize import SpacySenticizer -from pygaggle.rerank import Query, Text +from pygaggle.rerank.base import Query, Text __all__ = ['MISSING_ID', 'LitReviewCategory', 'LitReviewAnswer', 'LitReviewDataset', 'LitReviewSubcategory'] @@ -23,7 +25,8 @@ class LitReviewAnswer(BaseModel): class LitReviewSubcategory(BaseModel): - name: str + nq_name: str + kq_name: str answers: List[LitReviewAnswer] @@ -33,6 +36,7 @@ class LitReviewCategory(BaseModel): class LitReviewDataset(BaseModel): + version: str categories: List[LitReviewCategory] @classmethod @@ -40,30 +44,51 @@ def from_file(cls, filename: str) -> 'LitReviewDataset': with 
open(filename) as f: return cls(**json.load(f)) - @property - def query_answer_pairs(self): - return ((subcat.name, ans) for cat in self.categories + def query_answer_pairs(self, split: str = 'nq'): + return ((subcat.nq_name if split == 'nq' else subcat.kq_name, ans) for cat in self.categories for subcat in cat.sub_categories for ans in subcat.answers) - def to_senticized_dataset(self, index_path: str) -> List[RelevanceExample]: - loader = LuceneDocumentLoader(index_path) + def to_senticized_dataset(self, + index_path: str, + split: str = 'nq') -> List[RelevanceExample]: + loader = Cord19DocumentLoader(index_path) tokenizer = SpacySenticizer() example_map = OrderedDict() rel_map = OrderedDict() - for query, document in self.query_answer_pairs: + for query, document in self.query_answer_pairs(split=split): if document.id == MISSING_ID: logging.warning(f'Skipping {document.title} (missing ID)') continue key = (query, document.id) - example_map.setdefault(key, tokenizer(loader.load_document(document.id))) + try: + doc = loader.load_document(document.id) + example_map.setdefault(key, tokenizer(doc.all_text)) + except ValueError as e: + logging.warning(f'Skipping {document.id} ({e})') + continue sents = example_map[key] rel_map.setdefault(key, [False] * len(sents)) for idx, s in enumerate(sents): if document.exact_answer in s: rel_map[key][idx] = True + mean_stats = defaultdict(list) for (_, doc_id), rels in rel_map.items(): + int_rels = np.array(list(map(int, rels))) + p = int_rels.sum() + mean_stats['Average spans'].append(p) + mean_stats['Random P@1'].append(np.mean(int_rels)) + n = len(int_rels) - p + N = len(int_rels) + mean_stats['Random R@3'].append(1 - (n * (n - 1) * (n - 2)) / (N * (N - 1) * (N - 2))) + numer = np.array([sp.comb(n, i) / (N - i) for i in range(0, n + 1)]) * p + denom = np.array([sp.comb(N, i) for i in range(0, n + 1)]) + rr = 1 / np.arange(1, n + 2) + rmrr = np.sum(numer * rr / denom) + mean_stats['Random MRR'].append(rmrr) if not any(rels): 
logging.warning(f'{doc_id} has no relevant answers') + for k, v in mean_stats.items(): + logging.info(f'{k}: {np.mean(v)}') return [RelevanceExample(Query(query), list(map(lambda s: Text(s, dict(docid=docid)), sents)), rels) for ((query, docid), sents), (_, rels) in zip(example_map.items(), rel_map.items())] diff --git a/pygaggle/data/relevance.py b/pygaggle/data/relevance.py index b53f5982..a0e65620 100644 --- a/pygaggle/data/relevance.py +++ b/pygaggle/data/relevance.py @@ -1,16 +1,15 @@ from dataclasses import dataclass from functools import lru_cache -from itertools import chain from typing import List import json import re from pyserini.search import pysearch -from pygaggle.rerank import Query, Text +from pygaggle.rerank.base import Query, Text -__all__ = ['RelevanceExample', 'LuceneDocumentLoader'] +__all__ = ['RelevanceExample', 'Cord19DocumentLoader'] @dataclass @@ -20,15 +19,34 @@ class RelevanceExample: labels: List[bool] -class LuceneDocumentLoader: +@dataclass +class Cord19Document: + abstract: str + body_text: str + ref_entries: str + + @property + def all_text(self): + return '\n'.join((self.abstract, self.body_text, self.ref_entries)) + + +class Cord19DocumentLoader: double_space_pattern = re.compile(r'\s\s+') def __init__(self, index_path: str): self.searcher = pysearch.SimpleSearcher(index_path) @lru_cache(maxsize=1024) - def load_document(self, id: str) -> str: - article = json.loads(self.searcher.doc(id).lucene_document().get('raw')) + def load_document(self, id: str) -> Cord19Document: + def unfold(entries): + return '\n'.join(x['text'] for x in entries) + try: + article = json.loads(self.searcher.doc(id).lucene_document().get('raw')) + except json.decoder.JSONDecodeError: + raise ValueError('article not found') + except AttributeError: + raise ValueError('document unretrievable') ref_entries = article['ref_entries'].values() - text = '\n'.join(x['text'] for x in chain(article['abstract'], article['body_text'], ref_entries)) - return text + 
return Cord19Document(unfold(article['abstract']), + unfold(article['body_text']), + unfold(ref_entries)) diff --git a/pygaggle/model/encode.py b/pygaggle/model/encode.py index ddfc9be4..9b19d3e8 100644 --- a/pygaggle/model/encode.py +++ b/pygaggle/model/encode.py @@ -6,7 +6,7 @@ import torch.nn as nn from .tokenize import BatchTokenizer -from pygaggle.rerank import TextType +from pygaggle.rerank.base import TextType __all__ = ['LongBatchEncoder', 'EncoderOutputBatch', 'SingleEncoderOutput', 'SpecialTokensCleaner'] diff --git a/pygaggle/model/evaluate.py b/pygaggle/model/evaluate.py index e0a39906..3c355bbb 100644 --- a/pygaggle/model/evaluate.py +++ b/pygaggle/model/evaluate.py @@ -6,8 +6,8 @@ from tqdm import tqdm import numpy as np -from pygaggle.data import RelevanceExample -from pygaggle.rerank import Reranker +from pygaggle.data.kaggle import RelevanceExample +from pygaggle.rerank.base import Reranker __all__ = ['RerankerEvaluator', 'metric_names'] @@ -34,6 +34,11 @@ def value(self): return np.mean(self.scores) +class TruncatingMixin: + def truncated_rels(self, scores: List[float]) -> np.ndarray: + return np.array(scores) + + def register_metric(name): def wrap_fn(metric_cls): METRIC_MAP[name] = metric_cls @@ -46,43 +51,69 @@ def metric_names(): return list(METRIC_MAP.keys()) -def truncated_rels(scores: List[float], top_k: int) -> np.ndarray: - rel_idxs = sorted(list(enumerate(scores)), key=lambda x: x[1], reverse=True)[:top_k] - rel_idxs = [x[0] for x in rel_idxs] - score_rels = np.zeros(len(scores), dtype=int) - score_rels[rel_idxs] = 1 - return score_rels +class TopkMixin(TruncatingMixin): + top_k: int = None + + def truncated_rels(self, scores: List[float]) -> np.ndarray: + rel_idxs = sorted(list(enumerate(scores)), key=lambda x: x[1], reverse=True)[self.top_k:] + scores = np.array(scores) + scores[[x[0] for x in rel_idxs]] = 0 + return scores + +class DynamicThresholdingMixin(TruncatingMixin): + threshold: float = 0.5 -@register_metric('recall') -class 
RecallAccumulator(MeanAccumulator): - top_k = None + def truncated_rels(self, scores: List[float]) -> np.ndarray: + scores = np.array(scores) + scores[scores < self.threshold * np.max(scores)] = 0 + return scores + +class RecallAccumulator(TruncatingMixin, MeanAccumulator): def accumulate(self, scores: List[float], gold: RelevanceExample): - score_rels = truncated_rels(scores, self.top_k) + score_rels = self.truncated_rels(scores) + score_rels[score_rels != 0] = 1 gold_rels = np.array(gold.labels, dtype=int) score = recall_score(gold_rels, score_rels, zero_division=1) self.scores.append(score) -@register_metric('precision') -class PrecisionAccumulator(MeanAccumulator): - top_k = None - +class PrecisionAccumulator(TruncatingMixin, MeanAccumulator): def accumulate(self, scores: List[float], gold: RelevanceExample): - score_rels = truncated_rels(scores, self.top_k) + score_rels = self.truncated_rels(scores) + score_rels[score_rels != 0] = 1 + score_rels = score_rels.astype(int) gold_rels = np.array(gold.labels, dtype=int) - self.scores.append((score_rels & gold_rels).sum() / score_rels.sum()) + sum_score = score_rels.sum() + if sum_score > 0: + self.scores.append((score_rels & gold_rels).sum() / sum_score) -@register_metric('recall@1') -class RecallAt1Metric(RecallAccumulator): +@register_metric('precision@1') +class PrecisionAt1Metric(TopkMixin, PrecisionAccumulator): top_k = 1 -@register_metric('precision@1') -class PrecisionAt1Metric(PrecisionAccumulator): - top_k = 1 +@register_metric('recall@3') +class RecallAt3Metric(TopkMixin, RecallAccumulator): + top_k = 3 + + +@register_metric('mrr') +class MrrMetric(MeanAccumulator): + def accumulate(self, scores: List[float], gold: RelevanceExample): + scores = sorted(list(enumerate(scores)), key=lambda x: x[1], reverse=True) + rr = next((1 / (rank_idx + 1) for rank_idx, (idx, _) in enumerate(scores) if gold.labels[idx]), 0) + self.scores.append(rr) + + +class ThresholdedRecallMetric(DynamicThresholdingMixin, 
RecallAccumulator): + threshold = 0.5 + + +class ThresholdedPrecisionMetric(DynamicThresholdingMixin, PrecisionAccumulator): + threshold = 0.5 class RerankerEvaluator: diff --git a/pygaggle/model/tokenize.py b/pygaggle/model/tokenize.py index 4a23dca3..b6c2015d 100644 --- a/pygaggle/model/tokenize.py +++ b/pygaggle/model/tokenize.py @@ -6,7 +6,7 @@ from transformers import PreTrainedTokenizer import torch -from pygaggle.rerank import Query, Text, TextType +from pygaggle.rerank.base import Query, Text, TextType __all__ = ['BatchTokenizer', diff --git a/pygaggle/rerank/__init__.py b/pygaggle/rerank/__init__.py index cb01d169..e69de29b 100644 --- a/pygaggle/rerank/__init__.py +++ b/pygaggle/rerank/__init__.py @@ -1,4 +0,0 @@ -from .base import * -from .similarity import * -from .bm25 import * -from .transformer import * diff --git a/pygaggle/rerank/bm25.py b/pygaggle/rerank/bm25.py index 8460305d..da0b296f 100644 --- a/pygaggle/rerank/bm25.py +++ b/pygaggle/rerank/bm25.py @@ -7,7 +7,7 @@ from pyserini.index.pyutils import IndexReaderUtils import numpy as np -from pygaggle.rerank import Reranker, Query, Text +from .base import Reranker, Query, Text __all__ = ['Bm25Reranker'] @@ -45,5 +45,7 @@ def rerank(self, query: Query, texts: List[Text]) -> List[Text]: idfs = {w: self.index_utils.compute_bm25_term_weight(text.raw['docid'], w) for w in tf} score = sum(idfs[w] * tf[w] * (self.k1 + 1) / (tf[w] + self.k1 * (1 - self.b + self.b * (d_len / mean_len))) for w in tf) + if np.isnan(score): + score = 0 text.score = score return texts diff --git a/pygaggle/rerank/random.py b/pygaggle/rerank/random.py new file mode 100644 index 00000000..aca01742 --- /dev/null +++ b/pygaggle/rerank/random.py @@ -0,0 +1,19 @@ +from copy import deepcopy +from typing import List +import random + +from .base import Query, Text, Reranker + + +__all__ = ['RandomReranker'] + + +class RandomReranker(Reranker): + def __init__(self, seed: int = 0): + self.rand = random.Random(seed) + + def rerank(self, 
query: Query, texts: List[Text]) -> List[Text]: + texts = deepcopy(texts) + for text in texts: + text.score = self.rand.random() + return texts diff --git a/pygaggle/rerank/similarity.py b/pygaggle/rerank/similarity.py index 729232f0..abd969be 100644 --- a/pygaggle/rerank/similarity.py +++ b/pygaggle/rerank/similarity.py @@ -2,10 +2,10 @@ import torch -from pygaggle.model import SingleEncoderOutput +from pygaggle.model.encode import SingleEncoderOutput -__all__ = ['SimilarityMatrixProvider', 'InnerProductMatrixProvider'] +__all__ = ['SimilarityMatrixProvider', 'CosineSimilarityMatrixProvider'] class SimilarityMatrixProvider: @@ -16,7 +16,7 @@ def compute_matrix(self, pass -class InnerProductMatrixProvider(SimilarityMatrixProvider): +class CosineSimilarityMatrixProvider(SimilarityMatrixProvider): @torch.no_grad() def compute_matrix(self, encoded_query: SingleEncoderOutput, encoded_document: SingleEncoderOutput) -> torch.Tensor: query_repr = encoded_query.encoder_output diff --git a/pygaggle/rerank/transformer.py b/pygaggle/rerank/transformer.py index 0314ed19..419c2c41 100644 --- a/pygaggle/rerank/transformer.py +++ b/pygaggle/rerank/transformer.py @@ -1,15 +1,19 @@ from copy import deepcopy from typing import List -from transformers import T5ForConditionalGeneration, PreTrainedModel +from transformers import T5ForConditionalGeneration, PreTrainedModel, PreTrainedTokenizer, BertForQuestionAnswering import torch +from .base import Reranker, Query, Text +from .similarity import SimilarityMatrixProvider from pygaggle.model import greedy_decode, QueryDocumentBatchTokenizer, BatchTokenizer,\ QueryDocumentBatch, LongBatchEncoder, SpecialTokensCleaner -from pygaggle.rerank import Reranker, Query, Text, SimilarityMatrixProvider -__all__ = ['T5Reranker', 'TransformerReranker'] +__all__ = ['T5Reranker', + 'UnsupervisedTransformerReranker', + 'SequenceClassificationTransformerReranker', + 'QuestionAnsweringTransformerReranker'] class T5Reranker(Reranker): @@ -32,14 +36,14 @@ 
def rerank(self, query: Query, texts: List[Text]) -> List[Text]: # 6136 and 1176 are the indexes of the tokens false and true in T5. batch_scores = batch_scores[:, [6136, 1176]] - batch_log_probs = torch.nn.functional.log_softmax(batch_scores, dim=1) - batch_log_probs = batch_log_probs[:, 1].tolist() + batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1) + batch_log_probs = batch_scores[:, 1].tolist() for doc, score in zip(batch.documents, batch_log_probs): doc.score = score return texts -class TransformerReranker(Reranker): +class UnsupervisedTransformerReranker(Reranker): methods = dict(max=lambda x: x.max().item(), mean=lambda x: x.mean().item(), absmean=lambda x: x.abs().mean().item(), @@ -50,7 +54,8 @@ def __init__(self, tokenizer: BatchTokenizer, sim_matrix_provider: SimilarityMatrixProvider, method: str = 'max', - clean_special: bool = True): + clean_special: bool = True, + argmax_only: bool = False): assert method in self.methods, 'inappropriate scoring method' self.model = model self.tokenizer = tokenizer @@ -60,16 +65,78 @@ def __init__(self, self.clean_special = clean_special self.cleaner = SpecialTokensCleaner(tokenizer.tokenizer) self.device = next(self.model.parameters(), None).device + self.argmax_only = argmax_only @torch.no_grad() def rerank(self, query: Query, texts: List[Text]) -> List[Text]: encoded_query = self.encoder.encode_single(query) encoded_documents = self.encoder.encode(texts) texts = deepcopy(texts) + max_score = None for enc_doc, text in zip(encoded_documents, texts): if self.clean_special: enc_doc = self.cleaner.clean(enc_doc) matrix = self.sim_matrix_provider.compute_matrix(encoded_query, enc_doc) - score = self.methods[self.method](matrix) + score = self.methods[self.method](matrix) if matrix.size(1) > 0 else -10000 text.score = score + max_score = score if max_score is None else max(max_score, score) + if self.argmax_only: + for text in texts: + if text.score != max_score: + text.score = max_score - 10000 + return 
texts + + +class SequenceClassificationTransformerReranker(Reranker): + def __init__(self, + model: PreTrainedModel, + tokenizer: PreTrainedTokenizer): + self.tokenizer = tokenizer + self.model = model + self.device = next(model.parameters()).device + + @torch.no_grad() + def rerank(self, query: Query, texts: List[Text]) -> List[Text]: + texts = deepcopy(texts) + for text in texts: + ret = self.tokenizer.encode_plus(query.text, + text.text, + max_length=512, + return_token_type_ids=True, + return_tensors='pt') + input_ids = ret['input_ids'].to(self.device) + tt_ids = ret['token_type_ids'].to(self.device) + output, = self.model(input_ids, token_type_ids=tt_ids) + if output.size(1) > 1: + text.score = torch.nn.functional.log_softmax(output, 1)[0, -1].item() + else: + text.score = output.item() + return texts + + +class QuestionAnsweringTransformerReranker(Reranker): + def __init__(self, model: PreTrainedModel, tokenizer: PreTrainedTokenizer): + self.tokenizer = tokenizer + self.model = model + self.device = next(model.parameters()).device + + @torch.no_grad() + def rerank(self, query: Query, texts: List[Text]) -> List[Text]: + texts = deepcopy(texts) + for text in texts: + ret = self.tokenizer.encode_plus(query.text, + text.text, + max_length=512, + return_tensors='pt', + return_token_type_ids=True) + input_ids = ret['input_ids'].to(self.device) + tt_ids = ret['token_type_ids'].to(self.device) + start_scores, end_scores = self.model(input_ids, token_type_ids=tt_ids) + start_scores = start_scores[0] + end_scores = end_scores[0] + start_scores[(1 - tt_ids[0]).bool()] = -5000 + end_scores[(1 - tt_ids[0]).bool()] = -5000 + smax_val, smax_idx = start_scores.max(0) + emax_val, emax_idx = end_scores.max(0) + text.score = max(smax_val.item(), emax_val.item()) return texts diff --git a/pygaggle/run/evaluate_kaggle_highlighter.py b/pygaggle/run/evaluate_kaggle_highlighter.py index 52d6505d..9810cd79 100644 --- a/pygaggle/run/evaluate_kaggle_highlighter.py +++ 
b/pygaggle/run/evaluate_kaggle_highlighter.py @@ -3,18 +3,24 @@ import logging from pydantic import BaseModel, validator -from transformers import AutoModel, AutoTokenizer +from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, BertForQuestionAnswering, \ + BertForSequenceClassification import torch from .args import ArgumentParserBuilder, opt -from pygaggle.rerank import TransformerReranker, InnerProductMatrixProvider, Reranker, T5Reranker, Bm25Reranker +from pygaggle.rerank.base import Reranker +from pygaggle.rerank.bm25 import Bm25Reranker +from pygaggle.rerank.transformer import UnsupervisedTransformerReranker, T5Reranker, \ + SequenceClassificationTransformerReranker, QuestionAnsweringTransformerReranker +from pygaggle.rerank.random import RandomReranker +from pygaggle.rerank.similarity import CosineSimilarityMatrixProvider from pygaggle.model import SimpleBatchTokenizer, CachedT5ModelLoader, T5BatchTokenizer, RerankerEvaluator, metric_names from pygaggle.data import LitReviewDataset from pygaggle.settings import Settings SETTINGS = Settings() -METHOD_CHOICES = ('transformer', 'bm25', 't5') +METHOD_CHOICES = ('transformer', 'bm25', 't5', 'seq_class_transformer', 'qa_transformer', 'random') class KaggleEvaluationOptions(BaseModel): @@ -22,8 +28,11 @@ class KaggleEvaluationOptions(BaseModel): method: str batch_size: int device: str + split: str + do_lower_case: bool metrics: List[str] model_name: Optional[str] + tokenizer_name: Optional[str] @validator('dataset') def dataset_exists(cls, v: Path): @@ -41,6 +50,12 @@ def model_name_sane(cls, v: Optional[str], values, **kwargs): return 'monologg/biobert_v1.1_pubmed' return v + @validator('tokenizer_name') + def tokenizer_sane(cls, v: str, values, **kwargs): + if v is None: + return values['model_name'] + return v + def construct_t5(options: KaggleEvaluationOptions) -> Reranker: loader = CachedT5ModelLoader(SETTINGS.t5_model_dir, @@ -50,17 +65,56 @@ def construct_t5(options: 
KaggleEvaluationOptions) -> Reranker: SETTINGS.flush_cache) device = torch.device(options.device) model = loader.load().to(device).eval() - tokenizer = AutoTokenizer.from_pretrained(options.model_name) + tokenizer = AutoTokenizer.from_pretrained(options.model_name, do_lower_case=options.do_lower_case) tokenizer = T5BatchTokenizer(tokenizer, options.batch_size) return T5Reranker(model, tokenizer) def construct_transformer(options: KaggleEvaluationOptions) -> Reranker: device = torch.device(options.device) - model = AutoModel.from_pretrained(options.model_name).to(device).eval() - tokenizer = SimpleBatchTokenizer(AutoTokenizer.from_pretrained(options.model_name), options.batch_size) - provider = InnerProductMatrixProvider() - return TransformerReranker(model, tokenizer, provider) + try: + model = AutoModel.from_pretrained(options.model_name).to(device).eval() + except OSError: + model = AutoModel.from_pretrained(options.model_name, from_tf=True).to(device).eval() + tokenizer = SimpleBatchTokenizer(AutoTokenizer.from_pretrained(options.tokenizer_name, + do_lower_case=options.do_lower_case), + options.batch_size) + provider = CosineSimilarityMatrixProvider() + return UnsupervisedTransformerReranker(model, tokenizer, provider) + + +def construct_seq_class_transformer(options: KaggleEvaluationOptions) -> Reranker: + try: + model = AutoModelForSequenceClassification.from_pretrained(options.model_name) + except OSError: + try: + model = AutoModelForSequenceClassification.from_pretrained(options.model_name, from_tf=True) + except AttributeError: + # Hotfix for BioBERT MS MARCO. Refactor. 
+ BertForSequenceClassification.bias = torch.nn.Parameter(torch.zeros(2)) + BertForSequenceClassification.weight = torch.nn.Parameter(torch.zeros(2, 768)) + model = BertForSequenceClassification.from_pretrained(options.model_name, from_tf=True) + model.classifier.weight = BertForSequenceClassification.weight + model.classifier.bias = BertForSequenceClassification.bias + device = torch.device(options.device) + model = model.to(device).eval() + tokenizer = AutoTokenizer.from_pretrained(options.tokenizer_name, do_lower_case=options.do_lower_case) + return SequenceClassificationTransformerReranker(model, tokenizer) + + +def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker: + # We load a sequence classification model first -- again, as a workaround. Refactor. + try: + model = AutoModelForSequenceClassification.from_pretrained(options.model_name) + except OSError: + model = AutoModelForSequenceClassification.from_pretrained(options.model_name, from_tf=True) + fixed_model = BertForQuestionAnswering(model.config) + fixed_model.qa_outputs = model.classifier + fixed_model.bert = model.bert + device = torch.device(options.device) + model = fixed_model.to(device).eval() + tokenizer = AutoTokenizer.from_pretrained(options.tokenizer_name, do_lower_case=options.do_lower_case) + return QuestionAnsweringTransformerReranker(model, tokenizer) def construct_bm25(_: KaggleEvaluationOptions) -> Reranker: @@ -69,25 +123,32 @@ def construct_bm25(_: KaggleEvaluationOptions) -> Reranker: def main(): apb = ArgumentParserBuilder() - apb.add_opts(opt('--dataset', type=Path, default='data/kaggle-lit-review.json'), + apb.add_opts(opt('--dataset', type=Path, default='data/kaggle-lit-review-0.1.json'), opt('--method', required=True, type=str, choices=METHOD_CHOICES), opt('--model-name', type=str), + opt('--split', type=str, default='nq', choices=('nq', 'kq')), opt('--batch-size', '-bsz', type=int, default=96), opt('--device', type=str, default='cuda:0'), + 
opt('--tokenizer-name', type=str), + opt('--do-lower-case', action='store_true'), opt('--metrics', type=str, nargs='+', default=metric_names(), choices=metric_names())) args = apb.parser.parse_args() - options = KaggleEvaluationOptions(**vars(args)) ds = LitReviewDataset.from_file(str(options.dataset)) - examples = ds.to_senticized_dataset(SETTINGS.cord19_index_path) - construct_map = dict(transformer=construct_transformer, bm25=construct_bm25, t5=construct_t5) + examples = ds.to_senticized_dataset(SETTINGS.cord19_index_path, split=options.split) + construct_map = dict(transformer=construct_transformer, + bm25=construct_bm25, + t5=construct_t5, + seq_class_transformer=construct_seq_class_transformer, + qa_transformer=construct_qa_transformer, + random=lambda _: RandomReranker()) reranker = construct_map[options.method](options) evaluator = RerankerEvaluator(reranker, options.metrics) width = max(map(len, args.metrics)) + 1 stdout = [] for metric in evaluator.evaluate(examples): logging.info(f'{metric.name:<{width}}{metric.value:.5}') - stdout.append(f'{metric.name.title()}\t{metric.value}') + stdout.append(f'{metric.name}\t{metric.value}') print('\n'.join(stdout)) diff --git a/scripts/evaluate-highlighters.sh b/scripts/evaluate-highlighters.sh index c7599d1b..e7344374 100644 --- a/scripts/evaluate-highlighters.sh +++ b/scripts/evaluate-highlighters.sh @@ -1,7 +1,13 @@ mkdir -p results -python -um pygaggle.run.evaluate_kaggle_highlighter --method bm25 > results/bm25.log -python -um pygaggle.run.evaluate_kaggle_highlighter --method t5 > results/t5.log -python -um pygaggle.run.evaluate_kaggle_highlighter --method transformer --model-name biobert > results/biobert.log -python -um pygaggle.run.evaluate_kaggle_highlighter --method transformer --model-name allenai/scibert_scivocab_cased > results/scibert.log -python -um pygaggle.run.evaluate_kaggle_highlighter --method transformer --model-name bert-base-cased > results/bert.log +for split in kq nq; do + python -um 
pygaggle.run.evaluate_kaggle_highlighter --split $split --method random > results/random-$split.log; + python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method bm25 > results/bm25-$split.log; + python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method t5 > results/t5-$split.log; + python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name bert-base-cased > results/bbc-unsup-$split.log; + python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name biobert > results/biobert-unsup-$split.log; + python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method transformer --model-name allenai/scibert_scivocab_cased > results/scibert-unsup-$split.log; + python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method seq_class_transformer --model-name ~/models/biobert-msmarco > results/biobert-marco-$split.log; + python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method seq_class_transformer --model-name ~/models/bbu-marco --do-lower-case > results/bert-marco-$split.log; + python -um pygaggle.run.evaluate_kaggle_highlighter --split $split --method qa_transformer --model-name ~/models/biobert-squad1 > results/biobert-squadv1-$split.log; +done for name in results/*; do echo $name; cat $name; echo; done diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..5bfd1ab2 --- /dev/null +++ b/setup.py @@ -0,0 +1,39 @@ +import setuptools + + +with open('README.md') as fh: + long_description = fh.read() + +reqs = [ + 'coloredlogs==14.0', + 'numpy==1.18.2', + 'pydantic==1.5', + 'pyserini==0.9.0.0', + 'scikit-learn>=0.22', + 'scipy>=1.4', + 'spacy==2.2.4', + 'tensorboard>=2.1.0', + 'tensorflow>=2.2.0rc1', + 'tokenizers==0.5.2', + 'tqdm==4.45.0', + 'transformers==2.7.0' +] + +setuptools.setup( + name='pygaggle', + version='0.0.1', + author='PyGaggle Gaggle', + author_email='r33tang@uwaterloo.ca', + description='A 
gaggle of rerankers for CovidQA and CORD-19', + long_description=long_description, + long_description_content_type='text/markdown', + url='https://github.com/castorini/pygaggle', + install_requires=reqs, + packages=setuptools.find_packages(), + classifiers=[ + 'Programming Language :: Python :: 3', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + ], + python_requires='>=3.7', +) \ No newline at end of file