From 1c652bafddfccb1ed4b51589e86416365b6b3940 Mon Sep 17 00:00:00 2001 From: aakbik Date: Wed, 16 Jan 2019 11:44:33 +0100 Subject: [PATCH 1/4] GH-351: added big file loader util --- flair/file_utils.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/flair/file_utils.py b/flair/file_utils.py index 6395495d4..ca163af5e 100644 --- a/flair/file_utils.py +++ b/flair/file_utils.py @@ -11,6 +11,7 @@ import re from urllib.parse import urlparse +import mmap import requests # from allennlp.common.tqdm import Tqdm @@ -22,6 +23,20 @@ CACHE_ROOT = os.path.expanduser(os.path.join('~', '.flair')) +def load_big_file(f): + """ + Workaround for loading a big pickle file. Files over 2GB cause pickle errors on certain Mac and Windows distributions. + :param f: path to the file to load + :return: memory-mapped file object + """ + logger.info(f'loading big file {f}') + with open(f, 'r+b') as f_in: + # mmap seems to be much more memory efficient + bf = mmap.mmap(f_in.fileno(), 0) + f_in.close() + return bf + + def url_to_filename(url: str, etag: str = None) -> str: """ Converts a url into a filename in a reversible way. 
From 3a2b11a82770ab596596a7021806c4cb64d834b4 Mon Sep 17 00:00:00 2001 From: aakbik Date: Wed, 16 Jan 2019 14:01:12 +0100 Subject: [PATCH 2/4] GH-351: added big file deserialization --- flair/models/sequence_tagger_model.py | 8 ++++++-- flair/models/text_classification_model.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 8a45b8454..730b90890 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -238,10 +238,14 @@ def _load_state(cls, model_file: Union[str, Path]): # https://docs.python.org/3/library/warnings.html#temporarily-suppressing-warnings with warnings.catch_warnings(): warnings.filterwarnings("ignore") + # load_big_file is a workaround by https://github.com/highway11git to load models on some Mac/Windows setups + # see https://github.com/zalandoresearch/flair/issues/351 if torch.cuda.is_available(): - state = torch.load(str(model_file)) + f = flair.file_utils.load_big_file(str(model_file)) + state = torch.load(f) else: - state = torch.load(str(model_file), map_location={'cuda:0': 'cpu'}) + f = flair.file_utils.load_big_file(str(model_file)) + state = torch.load(f, map_location={'cuda:0': 'cpu'}) return state def forward_loss(self, sentences: Union[List[Sentence], Sentence], sort=True) -> torch.tensor: diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index a288bb509..de733a02b 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -140,10 +140,14 @@ def _load_state(cls, model_file: Union[str, Path]): # https://docs.python.org/3/library/warnings.html#temporarily-suppressing-warnings with warnings.catch_warnings(): warnings.filterwarnings("ignore") + # load_big_file is a workaround by https://github.com/highway11git to load models on some Mac/Windows setups + # see 
https://github.com/zalandoresearch/flair/issues/351 if torch.cuda.is_available(): - state = torch.load(str(model_file)) + f = flair.file_utils.load_big_file(str(model_file)) + state = torch.load(f) else: - state = torch.load(str(model_file), map_location={'cuda:0': 'cpu'}) + f = flair.file_utils.load_big_file(str(model_file)) + state = torch.load(f, map_location={'cuda:0': 'cpu'}) return state def forward_loss(self, sentences: Union[List[Sentence], Sentence]) -> torch.tensor: From 8c27a344c251c4415cffee84f5e17aaf2a6f7568 Mon Sep 17 00:00:00 2001 From: aakbik Date: Thu, 17 Jan 2019 11:41:05 +0100 Subject: [PATCH 3/4] GH-80: added Spanish LM embeddings trained by @iamyihwa --- flair/embeddings.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/flair/embeddings.py b/flair/embeddings.py index 7a3904442..709970df1 100644 --- a/flair/embeddings.py +++ b/flair/embeddings.py @@ -571,6 +571,23 @@ def __init__(self, model: str, detach: bool = True, use_cache: bool = False, cac base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-eu-large-backward-v0.1.pt' model = cached_path(base_path, cache_dir=cache_dir) + # Spanish forward fast + elif model.lower() == 'spanish-forward-fast' or model.lower() == 'es-forward-fast': + base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/language_model_es_forward/lm-es-forward-fast.pt' + model = cached_path(base_path, cache_dir=cache_dir) + # Spanish backward fast + elif model.lower() == 'spanish-backward-fast' or model.lower() == 'es-backward-fast': + base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/language_model_es_backward/lm-es-backward-fast.pt' + model = cached_path(base_path, cache_dir=cache_dir) + + # Spanish forward + elif model.lower() == 'spanish-forward' or model.lower() == 'es-forward': + base_path = 
'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/language_model_es_forward_long/lm-es-forward.pt' + model = cached_path(base_path, cache_dir=cache_dir) + # Spanish backward + elif model.lower() == 'spanish-backward' or model.lower() == 'es-backward': + base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/language_model_es_backward_long/lm-es-backward.pt' + model = cached_path(base_path, cache_dir=cache_dir) elif not Path(model).exists(): raise ValueError(f'The given model "{model}" is not available or is not a valid path.') From ce2caf046d8e3cdedf19f57237c4fbda4033da8b Mon Sep 17 00:00:00 2001 From: aakbik Date: Thu, 17 Jan 2019 11:54:12 +0100 Subject: [PATCH 4/4] GH-80: added Spanish LM embeddings trained by @iamyihwa --- resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md index db8429c3f..4af76df04 100644 --- a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md +++ b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md @@ -72,6 +72,10 @@ Currently, the following contextual string embeddings are provided (more coming) | 'portuguese-backward' | Portuguese | Added by [@ericlief](https://github.com/ericlief/language_models): Backward LM embeddings | | 'basque-forward' | Basque | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Forward LM embeddings | | 'basque-backward' | Basque | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Backward LM embeddings | +| 'spanish-forward' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Forward LM embeddings over Wikipedia | +| 'spanish-backward' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Backward LM embeddings over Wikipedia | +| 'spanish-forward-fast' | Spanish | Added by 
[@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): CPU-friendly forward LM embeddings over Wikipedia | +| 'spanish-backward-fast' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): CPU-friendly backward LM embeddings over Wikipedia | So, if you want to load embeddings from the English news backward LM model, instantiate the method as follows: