Commit 16f2f8a

Merge branch 'master' of github.com:ncbi-nlp/NCBI_BERT

Yifan Peng committed Jun 9, 2020
2 parents 57d4360 + edaae99

Showing 53 changed files with 5,544 additions and 6 deletions.
23 changes: 23 additions & 0 deletions LICENSE.txt
@@ -0,0 +1,23 @@
PUBLIC DOMAIN NOTICE
National Center for Biotechnology Information

This software/database is a "United States Government Work" under the terms of
the United States Copyright Act. It was written as part of the author's
official duties as a United States Government employee and thus cannot be
copyrighted. This software/database is freely available to the public for use.
The National Library of Medicine and the U.S. Government have not placed any
restriction on its use or reproduction.

Although all reasonable efforts have been taken to ensure the accuracy and
reliability of the software and data, the NLM and the U.S. Government do not and
cannot warrant the performance or results that may be obtained by using this
software or data. The NLM and the U.S. Government disclaim all warranties,
express or implied, including warranties of performance, merchantability or
fitness for any particular purpose.

Please cite the author in any work or product based on this material:

Peng Y, Yan S, Lu Z. Transfer Learning in Biomedical Natural Language
Processing: An Evaluation of BERT and ELMo on Ten Benchmarking Datasets.
In Proceedings of the 2019 Workshop on Biomedical Natural Language Processing
(BioNLP 2019). 2019:58-65.
6 changes: 3 additions & 3 deletions README.md
@@ -65,7 +65,7 @@ python bluebert/run_bluebert_ner.py \
--bert_config_file=$BlueBERT_DIR/bert_config.json \
--init_checkpoint=$BlueBERT_DIR/bert_model.ckpt \
--num_train_epochs=30.0 \
-  --do_lower_case=False \
+  --do_lower_case=true \
--data_dir=$DATASET_DIR \
--output_dir=$OUTPUT_DIR
```
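
The change above flips `--do_lower_case` from `False` to `true`: the released BlueBERT checkpoints are uncased, so their WordPiece vocabulary contains only lowercase pieces and inputs must be lowercased to match. A minimal sketch of the effect, assuming the third-party `transformers` package and its stock `bert-base-uncased` tokenizer (neither is part of this repository):

```python
from transformers import BertTokenizer

# With an uncased vocabulary, lowercasing the input (do_lower_case=True)
# keeps tokens on known word pieces; skipping it can push mixed-case
# tokens toward [UNK].
tok_lower = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
tok_cased = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=False)

print(tok_lower.tokenize("BRCA1 Mutations"))  # e.g. ['br', '##ca', '##1', 'mutations']
print(tok_cased.tokenize("BRCA1 Mutations"))  # uppercase pieces may fall back to [UNK]
```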
@@ -89,7 +89,7 @@ python bluebert/run_bluebert.py \
--num_train_epochs=10.0 \
--data_dir=$DATASET_DIR \
--output_dir=$OUTPUT_DIR \
-  --do_lower_case=False
+  --do_lower_case=true
```

The task name can be
@@ -134,7 +134,7 @@ python bluebert/run_bluebert.py \
--num_train_epochs=10.0 \
--data_dir=$DATASET_DIR \
--output_dir=$OUTPUT_DIR \
-  --do_lower_case=False
+  --do_lower_case=true
```

## <a name="pubmed"></a>Preprocessed PubMed texts
11 changes: 8 additions & 3 deletions elmo/elmoft.py
@@ -111,7 +111,7 @@ def __init__(self, lm_model, config, task_type, iactvtn='relu', oactvtn='sigmoid'
self.seq2seq = None
encoder_odim = self.n_embd
self.maxlen = self.task_params.setdefault('maxlen', 128)
-self.norm = NORM_TYPE_MAP[norm_type](seqlen)
+self.norm = NORM_TYPE_MAP[norm_type](self.maxlen)
self.linear = nn.Sequential(nn.Linear(encoder_odim, fchdim), self._int_actvtn(), nn.Linear(fchdim, fchdim), self._int_actvtn(), nn.Linear(fchdim, num_lbs), self._out_actvtn()) if fchdim else nn.Sequential(nn.Linear(encoder_odim, num_lbs), self._out_actvtn())
elif seq2vec:
self.pool = None
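
The one-line fix above replaces `seqlen`, which is undefined in this scope, with the `maxlen` value stored on the instance two lines earlier, so the normalization layer is sized to the padded sequence length. A minimal sketch of the pattern, with a hypothetical `NORM_TYPE_MAP` standing in for the one in `elmoft.py`:

```python
import torch
from torch import nn

# Hypothetical stand-in for the NORM_TYPE_MAP defined in elmoft.py: each
# entry maps a name to a normalization-layer constructor that takes the
# size of the dimension being normalized.
NORM_TYPE_MAP = {'batch': nn.BatchNorm1d, 'layer': nn.LayerNorm}

maxlen = 128
norm = NORM_TYPE_MAP['layer'](maxlen)

# Because the layer is sized by maxlen, every input must be padded or
# truncated to exactly maxlen positions before normalization.
x = torch.randn(4, maxlen)  # (batch, maxlen)
print(norm(x).shape)        # torch.Size([4, 128])
```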
@@ -529,6 +529,12 @@ def elmo_config(options_path, weights_path, elmoedim=1024, dropout=0.5):
}


def gen_pytorch_wrapper(mdl_type, mdl_name, **kwargs):
wrapper_cls = PytorchSeq2SeqWrapper if mdl_type == 'seq2seq' else PytorchSeq2VecWrapper
mdl_cls = PYTORCH_WRAPPER[mdl_name]
return wrapper_cls(module=mdl_cls(**kwargs))


def gen_mdl(mdl_name, pretrained=True, use_gpu=False, distrb=False, dev_id=None):
try:
params = LM_PARAMS[PARAMS_MAP[mdl_name]]
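
The newly added `gen_pytorch_wrapper` helper dispatches a named PyTorch recurrent module into an AllenNLP sequence encoder. A hedged usage sketch, assuming `PYTORCH_WRAPPER` maps names such as `'lstm'` to `torch.nn.LSTM` (the real registry is defined elsewhere in `elmoft.py`) and the AllenNLP wrapper classes this code imports:

```python
import torch
from torch import nn
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper

# Assumed registry shape; the real PYTORCH_WRAPPER mapping is defined
# elsewhere in elmoft.py.
PYTORCH_WRAPPER = {'lstm': nn.LSTM, 'gru': nn.GRU}

def gen_pytorch_wrapper(mdl_type, mdl_name, **kwargs):
    wrapper_cls = PytorchSeq2SeqWrapper if mdl_type == 'seq2seq' else PytorchSeq2VecWrapper
    mdl_cls = PYTORCH_WRAPPER[mdl_name]
    return wrapper_cls(module=mdl_cls(**kwargs))

# Wrap a bidirectional LSTM as a sequence-to-vector encoder
# (AllenNLP's wrappers require batch_first=True).
seq2vec = gen_pytorch_wrapper('seq2vec', 'lstm', input_size=1024,
                              hidden_size=256, batch_first=True,
                              bidirectional=True)
tokens = torch.randn(4, 128, 1024)           # (batch, seq, embedding dim)
mask = torch.ones(4, 128, dtype=torch.bool)  # every position is a real token
print(seq2vec(tokens, mask).shape)           # torch.Size([4, 512])
```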
@@ -671,8 +677,7 @@ def eval(clf, dataset, binlbr, clf_tknids, pad_val=0, task_type='mltc-clf', task
tkns_tnsr = [batch_to_ids(tkns_tnsr[x]) for x in [0,1]]
if (use_gpu): tkns_tnsr, lb_tnsr, pool_idx= [tkns_tnsr[x].to('cuda') for x in [0,1]] , lb_tnsr.to('cuda'), [pool_idx[x].to('cuda') for x in [0,1]]
elif task_type == 'nmt':
-# tkns_tnsr, lb_tnsr = [s.split(SC) for s in tkns_tnsr if (type(s) is str and s != '') and len(s) > 0], [list(map(int, s.split(SC))) for s in lb_tnsr if (type(s) is str and s != '') and len(s) > 0]
-tkns_tnsr, lb_tnsr = zip(*[(sx.split(SC), list(map(int, sy.split(SC)))) for sx, sy in zip(tkns_tnsr, lb_tnsr) if ((type(sx) is str and sx != '') or len(sx) > 0) and ((type(sy) is str and sy != '') or len(sy) > 0)])
+tkns_tnsr, lb_tnsr = [s.split(SC) for s in tkns_tnsr if (type(s) is str and s != '') or len(s) > 0], [list(map(int, s.split(SC))) for s in lb_tnsr if (type(s) is str and s != '') or len(s) > 0]
if (len(tkns_tnsr) == 0 or len(lb_tnsr) == 0): continue
tkns_tnsr = [s[:min(len(s), opts.maxlen)] + [''] * (opts.maxlen-len(s)) for s in tkns_tnsr]
_lb_tnsr = lb_tnsr = torch.LongTensor([s[:min(len(s), opts.maxlen)] + [pad_val] * (opts.maxlen-len(s)) for s in lb_tnsr])
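
The replacement line filters out empty examples and splits the delimited token and label strings on the separator `SC`; the two lines after it then truncate and right-pad each example to `opts.maxlen`. A toy sketch of the same transformation, with a hypothetical separator and lengths (the real `SC` and `maxlen` come from the script's options, and the filter condition is simplified here):

```python
SC = '|'            # hypothetical separator; elmoft.py defines its own SC
maxlen, pad_val = 8, 0

tokens = ['the|cat|sat', '', 'dogs|bark']
labels = ['0|1|0', '', '1|0']

# Drop empty examples, then split the delimited strings into lists.
tkns = [s.split(SC) for s in tokens if type(s) is str and s != '']
lbs = [list(map(int, s.split(SC))) for s in labels if type(s) is str and s != '']

# Truncate to maxlen and right-pad: tokens with '', labels with pad_val.
tkns = [s[:maxlen] + [''] * (maxlen - len(s)) for s in tkns]
lbs = [s[:maxlen] + [pad_val] * (maxlen - len(s)) for s in lbs]

print(tkns[0])  # ['the', 'cat', 'sat', '', '', '', '', '']
print(lbs[0])   # [0, 1, 0, 0, 0, 0, 0, 0]
```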
142 changes: 142 additions & 0 deletions mt-bluebert/.gitignore
@@ -0,0 +1,142 @@

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
mtdnn_env
venv_windows
mtdnn_env_apex

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
.DS_Store

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# IDE pycharm
.idea/

log/
model/
submission/
save/
book_corpus_test/
book_corpus_train/
checkpoints/
.pt_description_history
.git-credentials
pt_bert/philly
.vs
*.pyproj
pt_bert/checkpoint
*/aml_experiments
screenlog.*
data
pt_bert/scripts
pt_bert/model_data
screen*
checkpoint
*.sln
dt_mtl
philly
bert_models
run_baseline*
mt_dnn_models
*pyc
run_test/
experiments/superglue
22 changes: 22 additions & 0 deletions mt-bluebert/LICENSE
@@ -0,0 +1,22 @@
PUBLIC DOMAIN NOTICE
National Center for Biotechnology Information

This software/database is a "United States Government Work" under the terms of
the United States Copyright Act. It was written as part of the author's
official duties as a United States Government employee and thus cannot be
copyrighted. This software/database is freely available to the public for use.
The National Library of Medicine and the U.S. Government have not placed any
restriction on its use or reproduction.

Although all reasonable efforts have been taken to ensure the accuracy and
reliability of the software and data, the NLM and the U.S. Government do not and
cannot warrant the performance or results that may be obtained by using this
software or data. The NLM and the U.S. Government disclaim all warranties,
express or implied, including warranties of performance, merchantability or
fitness for any particular purpose.

Please cite the author in any work or product based on this material:

Peng Y, Chen Q, Lu Z. An Empirical Study of Multi-Task Learning on BERT
for Biomedical Text Mining. In Proceedings of the 2020 Workshop on Biomedical
Natural Language Processing (BioNLP 2020). 2020.
77 changes: 77 additions & 0 deletions mt-bluebert/README.md
@@ -0,0 +1,77 @@
# Multi-Task Learning on BERT for Biomedical Text Mining

This repository provides the code and models for Multi-Task Learning on BERT for Biomedical Text Mining.
The package is based on [`mt-dnn`](https://github.com/namisan/mt-dnn).

## Pre-trained models

The pre-trained MT-BlueBERT weights, vocab, and config files can be downloaded from:

* [mt-bluebert-biomedical](https://github.com/yfpeng/mt-bluebert/releases/download/0.1/mt-bluebert-biomedical.pt)
* [mt-bluebert-clinical](https://github.com/yfpeng/mt-bluebert/releases/download/0.1/mt-bluebert-clinical.pt)

The benchmark datasets can be downloaded from [https://github.com/ncbi-nlp/BLUE_Benchmark](https://github.com/ncbi-nlp/BLUE_Benchmark).

## Quick start

### Setup Environment
1. Python 3.6
2. Install the requirements:
```bash
pip install -r requirements.txt
```

### Download data
Please refer to the BLUE_Benchmark repository to download the data: https://github.com/ncbi-nlp/BLUE_Benchmark


### Preprocess data
```bash
bash ncbi_scripts/blue_prepro.sh
```

### Train an MT-DNN model
```bash
bash ncbi_scripts/run_blue_mt_dnn.sh
```

### Fine-tune a model
```bash
bash ncbi_scripts/run_blue_fine_tune.sh
```

### Convert a TensorFlow BERT model to the MT-DNN format
```bash
python ncbi_scripts/convert_tf_to_pt.py --tf_checkpoint_root $SRC_ROOT --pytorch_checkpoint_path $DEST --encoder_type 1
```
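
For orientation only: a conversion of this kind reads each variable out of the TensorFlow checkpoint and re-serializes it for PyTorch. The sketch below shows that generic idea under stated assumptions (placeholder paths, no variable renaming); the real `convert_tf_to_pt.py` additionally maps TF variable names onto the MT-DNN module layout.

```python
import tensorflow as tf
import torch

# Generic TF-checkpoint -> PyTorch state-dict sketch; paths are placeholders.
reader = tf.train.load_checkpoint("/path/to/bert_model.ckpt")
state_dict = {
    name: torch.from_numpy(reader.get_tensor(name))
    for name in reader.get_variable_to_shape_map()
}
torch.save(state_dict, "/path/to/converted_model.pt")
```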

## Citing MT-BLUE

Peng Y, Chen Q, Lu Z. An Empirical Study of Multi-Task Learning on BERT
for Biomedical Text Mining. In Proceedings of the 2020 Workshop on Biomedical
Natural Language Processing (BioNLP 2020). 2020.

```
@InProceedings{peng2020empirical,
author = {Yifan Peng and Qingyu Chen and Zhiyong Lu},
title = {An Empirical Study of Multi-Task Learning on BERT for Biomedical Text Mining},
booktitle = {Proceedings of the 2020 Workshop on Biomedical Natural Language Processing (BioNLP 2020)},
year = {2020},
}
```
## Acknowledgments
This work was supported by the Intramural Research Programs of the National Institutes of Health, National Library of
Medicine, and by the National Library of Medicine of the National Institutes of Health under award number K99LM013001-01.
We are also grateful to the authors of BERT and mt-dnn for making their data and code publicly available.
## Disclaimer
This tool shows the results of research conducted in the Computational Biology Branch, NLM/NCBI. The information produced
on this website is not intended for direct diagnostic use or medical decision-making without review and oversight
by a clinical professional. Individuals should not change their health behavior solely on the basis of information
produced on this website. NIH does not independently verify the validity or utility of the information produced
by this tool. If you have questions about the information produced on this website, please see a health care
professional. More information about NLM/NCBI's disclaimer policy is available.
