GH-80: Spanish Flair Embeddings #400

Closed
wants to merge 4 commits into from
17 changes: 17 additions & 0 deletions flair/embeddings.py
@@ -571,6 +571,23 @@ def __init__(self, model: str, detach: bool = True, use_cache: bool = False, cac
base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/lm-eu-large-backward-v0.1.pt'
model = cached_path(base_path, cache_dir=cache_dir)

# Spanish forward fast
elif model.lower() == 'spanish-forward-fast' or model.lower() == 'es-forward-fast':
base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/language_model_es_forward/lm-es-forward-fast.pt'
model = cached_path(base_path, cache_dir=cache_dir)
# Spanish backward fast
elif model.lower() == 'spanish-backward-fast' or model.lower() == 'es-backward-fast':
base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/language_model_es_backward/lm-es-backward-fast.pt'
model = cached_path(base_path, cache_dir=cache_dir)

# Spanish forward
elif model.lower() == 'spanish-forward' or model.lower() == 'es-forward':
base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/language_model_es_forward_long/lm-es-forward.pt'
model = cached_path(base_path, cache_dir=cache_dir)
# Spanish backward
elif model.lower() == 'spanish-backward' or model.lower() == 'es-backward':
base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings-v0.4/language_model_es_backward_long/lm-es-backward.pt'
model = cached_path(base_path, cache_dir=cache_dir)

elif not Path(model).exists():
raise ValueError(f'The given model "{model}" is not available or is not a valid path.')
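For reference, a minimal usage sketch of the new aliases. It assumes the constructor above belongs to flair's CharLMEmbeddings class (the contextual string embedding class of this era, later renamed FlairEmbeddings); the sentence text is just an example:

```python
from flair.data import Sentence
from flair.embeddings import CharLMEmbeddings

# the model name is lower-cased and matched against the aliases above;
# the checkpoint is then downloaded from S3 and cached under ~/.flair
embedding = CharLMEmbeddings('spanish-forward')

# embed a Spanish sentence; each token receives a contextual string embedding
sentence = Sentence('Madrid es la capital de España .')
embedding.embed(sentence)

for token in sentence:
    print(token.text, token.get_embedding().size())
```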
15 changes: 15 additions & 0 deletions flair/file_utils.py
@@ -11,6 +11,7 @@
import re
from urllib.parse import urlparse

import mmap
import requests

# from allennlp.common.tqdm import Tqdm
@@ -22,6 +23,20 @@
CACHE_ROOT = os.path.expanduser(os.path.join('~', '.flair'))


def load_big_file(f):
Collaborator:
This code belongs to PR #397. I guess you added it incidentally?

Collaborator (Author):
oh oops yes

Collaborator (Author):
I'll correct and do another PR

Great @alanakbik @tabergma !
Thanks a lot for adding it!
Glad to contribute a little bit after so much help from you guys! Thank you!

"""
Workaround for loading a big pickle file. Files over 2GB cause pickle errors on certain Mac and Windows distributions.
:param f: path of the file to load
:return: a memory-mapped view of the file contents
"""
logger.info(f'loading big file {f}')
with open(f, 'r+b') as f_in:
# mmap seems to be much more memory efficient
bf = mmap.mmap(f_in.fileno(), 0)
f_in.close()
return bf


def url_to_filename(url: str, etag: str = None) -> str:
"""
Converts a url into a filename in a reversible way.
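An mmap object is file-like (it supports read, readline, and seek), so it can be handed straight to torch.load. A minimal sketch of the intended use, mirroring the call sites changed below; the model path is hypothetical:

```python
import torch

from flair.file_utils import load_big_file

# memory-map the checkpoint instead of reading it into RAM in one piece,
# sidestepping pickle's 2GB limit on the affected Mac/Windows builds
f = load_big_file('/path/to/big-model.pt')  # hypothetical path
state = torch.load(f, map_location={'cuda:0': 'cpu'})
```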
8 changes: 6 additions & 2 deletions flair/models/sequence_tagger_model.py
@@ -238,10 +238,14 @@ def _load_state(cls, model_file: Union[str, Path]):
# https://docs.python.org/3/library/warnings.html#temporarily-suppressing-warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
+ # load_big_file is a workaround by https://github.com/highway11git to load models on some Mac/Windows setups
+ # see https://github.com/zalandoresearch/flair/issues/351
if torch.cuda.is_available():
- state = torch.load(str(model_file))
+ f = flair.file_utils.load_big_file(str(model_file))
+ state = torch.load(f)
else:
- state = torch.load(str(model_file), map_location={'cuda:0': 'cpu'})
+ f = flair.file_utils.load_big_file(str(model_file))
+ state = torch.load(f, map_location={'cuda:0': 'cpu'})
return state

def forward_loss(self, sentences: Union[List[Sentence], Sentence], sort=True) -> torch.tensor:
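With this change, loading a pre-trained tagger goes through load_big_file transparently. A sketch, assuming the SequenceTagger.load entry point that wraps _load_state in this version of flair:

```python
from flair.data import Sentence
from flair.models import SequenceTagger

# _load_state above now memory-maps the checkpoint before unpickling it
tagger = SequenceTagger.load('ner')

sentence = Sentence('George Washington went to Washington .')
tagger.predict(sentence)
print(sentence.to_tagged_string())
```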
8 changes: 6 additions & 2 deletions flair/models/text_classification_model.py
@@ -140,10 +140,14 @@ def _load_state(cls, model_file: Union[str, Path]):
# https://docs.python.org/3/library/warnings.html#temporarily-suppressing-warnings
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
+ # load_big_file is a workaround by https://github.com/highway11git to load models on some Mac/Windows setups
+ # see https://github.com/zalandoresearch/flair/issues/351
if torch.cuda.is_available():
- state = torch.load(str(model_file))
+ f = flair.file_utils.load_big_file(str(model_file))
+ state = torch.load(f)
else:
- state = torch.load(str(model_file), map_location={'cuda:0': 'cpu'})
+ f = flair.file_utils.load_big_file(str(model_file))
+ state = torch.load(f, map_location={'cuda:0': 'cpu'})
return state

def forward_loss(self, sentences: Union[List[Sentence], Sentence]) -> torch.tensor:
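The classifier side follows the same pattern. A sketch of the CPU-only branch, assuming the TextClassifier.load_from_file entry point of this era and a hypothetical local model path; map_location remaps tensors saved on 'cuda:0' to the CPU:

```python
from flair.data import Sentence
from flair.models import TextClassifier

# on a machine without CUDA, _load_state takes the map_location branch
classifier = TextClassifier.load_from_file('/path/to/classifier.pt')  # hypothetical path

sentence = Sentence('El servicio fue excelente .')
classifier.predict(sentence)
print(sentence.labels)
```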
4 changes: 4 additions & 0 deletions resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
@@ -72,6 +72,10 @@ Currently, the following contextual string embeddings are provided (more coming)
| 'portuguese-backward' | Portuguese | Added by [@ericlief](https://github.com/ericlief/language_models): Backward LM embeddings |
| 'basque-forward' | Basque | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Forward LM embeddings |
| 'basque-backward' | Basque | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Backward LM embeddings |
| 'spanish-forward' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Forward LM embeddings over Wikipedia |
| 'spanish-backward' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Backward LM embeddings over Wikipedia |
| 'spanish-forward-fast' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): CPU-friendly forward LM embeddings over Wikipedia |
| 'spanish-backward-fast' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): CPU-friendly backward LM embeddings over Wikipedia |

So, if you want to load embeddings from the English news backward LM model, instantiate the method as follows:

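The code snippet that the diff view truncates here presumably resembles this minimal sketch (again assuming the CharLMEmbeddings class name of this era):

```python
from flair.embeddings import CharLMEmbeddings

# init embedding from the English news backward LM
charlm_embedding_backward = CharLMEmbeddings('news-backward')
```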