Merge pull request #79 from thammegowda/develop
v0.3.1
thammegowda committed Oct 29, 2021
2 parents 9064629 + 6529744 commit 4380a10
Showing 18 changed files with 643 additions and 182 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
*.egg-info
*.pytest_cache

# Byte-compiled / optimized / DLL files
/tmp
__pycache__/
14 changes: 13 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,18 @@
# Change Log

## v0.3.0 - WIP
## v0.3.1 - 20211028

- Add support for recipes; `list-recipe` and `get-recipe` subcommands added
- Add support for viewing dataset stats: words, chars, segs
- Fix URL for UN dev and test sets (the source was updated, so we updated too)
- Multilingual experiment support; ISO 639-3 code `mul` implies multilingual; e.g. mul-eng or eng-mul
- `--dev` accepts multiple datasets and merges them (useful for multilingual experiments)
- Tar files are extracted before read (performance improvement)
- setup.py: version and description are accessed via regex

---

## v0.3.0 - 20211021

> Big Changes: BCP-47, data compression
24 changes: 22 additions & 2 deletions README.md
@@ -240,6 +240,23 @@ $ tree data/deu-eng/
└── train.stats.json
```
## Recipes
> Since v0.3.1
A recipe is a set of datasets nominated for train, dev, and test, and is meant to improve the reproducibility of experiments.
Recipes are loaded from:
1. Default: [`mtdata/recipe/recipes.yml`](mtdata/recipe/recipes.yml) from source code
2. Cache dir: `$MTDATA/mtdata.recipe.yml` where `$MTDATA` has default of `~/.mtdata`
3. Current dir: `$PWD/mtdata.recipe.yml`
See [`mtdata/recipe/recipes.yml`](mtdata/recipe/recipes.yml) for format and examples.
```bash
mtdata list-recipe # see all recipes
mtdata get-recipe -ri <recipe_id> -o <out_dir> # get recipe, recreate dataset
```
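
For illustration only, here is a hedged sketch of peeking at a local recipe file from Python. It is not part of mtdata's CLI or API; the file location follows the cache-dir convention above, and the exact schema of the entries should be checked against [`mtdata/recipe/recipes.yml`](mtdata/recipe/recipes.yml).

```python
# A minimal sketch, not part of mtdata's CLI: peek into a local recipe file.
# The exact schema (e.g. 'id', 'train', 'dev', 'test' fields) should be checked
# against mtdata/recipe/recipes.yml; this just loads and prints whatever is there.
from pathlib import Path
from ruamel.yaml import YAML  # mtdata already depends on ruamel.yaml

recipe_file = Path.home() / '.mtdata' / 'mtdata.recipe.yml'  # i.e. $MTDATA/mtdata.recipe.yml
if recipe_file.exists():
    print(YAML(typ='safe').load(recipe_file))
else:
    print(f'{recipe_file} not found; mtdata would fall back to its built-in recipes.yml')
```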
## Language Name Standardization
### ISO 639-3
Internally, all language codes are mapped to ISO 639-3 codes.
@@ -287,6 +304,9 @@ print(iso3_code('eNgLIsH', fail_error=True)) # case doesnt matter
```
### BCP-47
> Since v0.3.0
We used ISO 639-3 codes from the beginning; however, we soon faced the limitation that ISO 639-3 cannot distinguish script and region variants of a language. So we upgraded to BCP-47-like language tags in `v0.3.0`.
* BCP-47 uses two-letter codes for some languages and three-letter codes for the rest; we use three-letter codes for all languages.
@@ -305,9 +325,9 @@ Our tags are of form `xxx_Yyyy_ZZ` where
Notes:
* Region is preserved when available and left blank when unavailable
* Script `Yyyy` is forcibly suppressed in obvious cases. E.g. `eng` is written using the `Latn` script; writing `eng-Latn` is awkward to read, and since `Latn` is the default script for English, we suppress it (`eng-Latn` -> `eng`). On the other hand, a language like Kannada is usually written using the `Knda` script (`kan-Knda` -> `kan`) but is occasionally written using the `Latn` script, so `kan-Latn` is not suppressed.
* The information about whats default script is obtained from IANA language code registry
* The information about the default script of a language is obtained from the IANA language code registry
* Language code `mul` stands for _multiple languages_, and is used as a placeholder for multilingual datasets (see `mul-eng`, which represents many-to-English dataset recipes in [mtdata/recipe/recipes.yml](mtdata/recipe/recipes.yml))
#### Example:
To inspect parsing/mapping, use `python -m mtdata.iso.bcp47 <args>`
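
For a quick look from Python, here is a hedged sketch of the same inspection. It assumes the `mtdata.iso.bcp47` module exposes a callable parser named `bcp47` (check `mtdata/iso/bcp47.py` before relying on it); the tag values in the loop are just illustrative inputs.

```python
# A minimal sketch (an assumption, not official docs): map raw tags to normalized BCP-47-like tags.
from mtdata.iso.bcp47 import bcp47  # assumed callable parser

for raw in ['en', 'en-GB', 'eng-Latn', 'kan-Knda', 'kan-Latn']:
    tag = bcp47(raw)
    # default scripts (e.g. Latn for eng, Knda for kan) should be suppressed,
    # while non-default combinations such as kan-Latn are preserved
    print(raw, '->', tag)
```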
15 changes: 6 additions & 9 deletions mtdata/__init__.py
@@ -3,27 +3,24 @@
# Author: Thamme Gowda [tg (at) isi (dot) edu]
# Created: 4/4/20

__version__ = '0.3.0'
__version__ = '0.3.1'
__description__ = 'mtdata is a tool to download datasets for machine translation'
__author__ = 'Thamme Gowda'

import logging as log
from pathlib import Path
import os
import enlighten
from ruamel.yaml import YAML


yaml = YAML()
debug_mode = False
_log_format = '%(asctime)s %(module)s.%(funcName)s:%(lineno)s %(levelname)s:: %(message)s'
log.basicConfig(level=log.INFO, datefmt='%Y-%m-%d %H:%M:%S', format=_log_format)
cache_dir = Path(os.environ.get('MTDATA', '~/.mtdata')).expanduser()
cached_index_file = cache_dir / f'mtdata.index.{__version__}.pkl'

try:
    import enlighten
    pbar_man = enlighten.get_manager()
except ImportError as e:
    log.warning("enlighten package maybe required. please run 'pip install englighten'")
    log.warning(e)
FILE_LOCK_TIMEOUT = 2 * 60 * 60 # 2 hours
pbar_man = enlighten.get_manager()


class MTDataException(Exception):
Expand Down
9 changes: 6 additions & 3 deletions mtdata/__main__.py
@@ -2,12 +2,15 @@
#
# Author: Thamme Gowda [tg (at) isi (dot) edu]
# Created: 4/4/20
import errno
from mtdata import main, log

if __name__ == '__main__':
def main():
    from mtdata import main
    try:
        main.main()
    except BrokenPipeError as e:
        # this happens when piped to '| head' which aborts pipe after limit. And that is okay.
        pass


if __name__ == '__main__':
    main()
60 changes: 49 additions & 11 deletions mtdata/cache.py
@@ -8,8 +8,9 @@
from dataclasses import dataclass
from pathlib import Path
from mtdata.index import Entry
from mtdata import log, __version__, pbar_man, MTDataException
from mtdata import log, __version__, pbar_man, MTDataException, FILE_LOCK_TIMEOUT
from mtdata.utils import ZipPath, TarPath
from mtdata.parser import Parser
from typing import List, Union

import portalocker
@@ -19,7 +20,6 @@
import requests
import math

MAX_TIMEOUT = 2 * 60 * 60 # 2 hours

headers = {'User-Agent': f'mtdata downloader {__version__}; cURL and wget like.'}

@@ -43,6 +43,46 @@ def get_entry(self, entry: Entry, fix_missing=True) -> Union[Path, List[Path]]:
local = self.get_local_in_paths(path=local, entry=entry)
return local

def get_stats(self, entry: Entry):
    path = self.get_entry(entry)
    parser = Parser(path, ext=entry.in_ext or None, ent=entry)
    count, skips, noise = 0, 0, 0
    toks = [0, 0]
    chars = [0, 0]
    for rec in parser.read_segs():
        if len(rec) < 2 or not rec[0] or not rec[1]:
            skips += 1
            continue
        if entry.is_noisy(seg1=rec[0], seg2=rec[1]):
            noise += 1
            skips += 1
            continue
        count += 1
        s1, s2 = rec[:2]  # get the first two recs
        chars[0] += len(s1)
        chars[1] += len(s2)
        s1_tok, s2_tok = s1.split(), s2.split()
        toks[0] += len(s1_tok)
        toks[1] += len(s2_tok)

    l1, l2 = entry.did.langs
    l1, l2 = l1.lang, l2.lang
    assert count > 0, f'No valid records are found for {entry.did}'
    if l2 < l1:  # keep language codes in sorted order in the report
        l1, l2 = l2, l1
        toks = toks[1], toks[0]
        chars = chars[1], chars[0]
    return {
        'id': str(entry.did),
        'segs': count,
        'segs_err': skips,
        'segs_noise': noise,
        f'{l1}_toks': toks[0],
        f'{l2}_toks': toks[1],
        f'{l1}_chars': chars[0],
        f'{l2}_chars': chars[1]  # char count of the second language
    }

def get_flag_file(self, file: Path):
    return file.with_name(file.name + '._valid')

@@ -74,39 +114,37 @@ def opus_xces_format(self, entry, fix_missing=True) -> List[Path]:
    l2_path = self.get_local_path(l2_url, fix_missing=fix_missing)
    return [align_file, l1_path, l2_path]

def get_local_in_paths(self, path:Path, entry: Entry,):
def get_local_in_paths(self, path: Path, entry: Entry,):
    in_paths = entry.in_paths
    if zipfile.is_zipfile(path):
        with zipfile.ZipFile(path) as root:
            in_paths = self.match_globs(names=root.namelist(), globs=in_paths)
        return [ZipPath(path, p) for p in in_paths]  # stdlib is buggy, so I made a workaround
    elif tarfile.is_tarfile(path):
        with tarfile.open(path, encoding='utf-8') as root:
            in_paths = self.match_globs(names=root.getnames(), globs=in_paths)
        return [TarPath(path, p) for p in in_paths]
    else:
        raise Exception(f'Unable to read {entry.did}; the file is neither zip nor tar')

def download(self, url: str, save_at: Path):
def download(self, url: str, save_at: Path, timeout=(5, 10)):
    valid_flag = self.get_flag_file(save_at)
    lock_file = valid_flag.with_suffix("._lock")
    if valid_flag.exists() and save_at.exists():
        return save_at
    save_at.parent.mkdir(parents=True, exist_ok=True)

    log.info(f"Acquiring lock on {lock_file}")
    with portalocker.Lock(lock_file, 'w', timeout=MAX_TIMEOUT) as fh:
    with portalocker.Lock(lock_file, 'w', timeout=FILE_LOCK_TIMEOUT) as fh:
        # check if downloaded by other parallel process
        if valid_flag.exists() and save_at.exists():
            return save_at
        log.info(f"Downloading {url} --> {save_at}")
        resp = requests.get(url=url, allow_redirects=True, headers=headers, stream=True)
        log.info(f"GET {url} {save_at}")
        resp = requests.get(url=url, allow_redirects=True, headers=headers, stream=True, timeout=timeout)
        assert resp.status_code == 200, resp.status_code
        buf_size = 2 ** 10
        n_buffers = math.ceil(int(resp.headers.get('Content-Length', '0')) / buf_size) or None
        desc = url
        if len(desc) > 40:
            desc = desc[:30] + '...' + desc[-10:]
        if len(desc) > 60:
            desc = desc[:30] + '...' + desc[-28:]
        with pbar_man.counter(color='green', total=n_buffers, unit='KiB', leave=False,
                              desc=f"{desc}") as pbar, open(save_at, 'wb', buffering=2**24) as out:
            for chunk in resp.iter_content(chunk_size=buf_size):
