From 2e2d5e5203b44afc1d9e0d99cb5e00a0f1ee84cb Mon Sep 17 00:00:00 2001
From: engisalor <50170623+engisalor@users.noreply.github.com>
Date: Tue, 18 Jun 2024 13:22:33 +0200
Subject: [PATCH] feat: use stanza, deprecate fasttext for langid

---
 corpusama/corpus/langid.py    | 27 ++++++++----
 corpusama/util/langid.py      | 77 ++++++++++++++++++++++-------------
 test/test_util/test_langid.py | 52 +++++++++++------------
 3 files changed, 93 insertions(+), 63 deletions(-)

diff --git a/corpusama/corpus/langid.py b/corpusama/corpus/langid.py
index 88a97c6..ce5bf8a 100644
--- a/corpusama/corpus/langid.py
+++ b/corpusama/corpus/langid.py
@@ -1,17 +1,23 @@
 """Methods to classify document languages and save results to the `_lang` table."""
-import fasttext
+# import fasttext
 import pandas as pd
-
-from corpusama.util import convert, langid, parallel, util
+import stanza
+from corpusama.util import convert, langid, util
 
 # TODO requires unit testing
+nlp = stanza.Pipeline(
+    lang="multilingual",
+    processors="langid",
+    ld_batch_size=64,  # batch size to use for language identification
+    max_cache_size=10,  # max number of pipelines to cache
+)
 def make_langid(
     self,
     table: str,
     chunksize: int = 5000,
-    cores=0,
+    # cores=0,
 ) -> None:
     """Generates language ID data in the `_lang` table.
@@ -27,13 +33,16 @@ def make_langid(
     query = "SELECT * FROM _pdf"
     if table == "_raw":
         query = "SELECT * FROM _raw WHERE body_html IS NOT null"
-    cores = parallel.set_cores(cores)
+    # cores = parallel.set_cores(cores)
     res = pd.read_sql(query, self.db.conn, chunksize=chunksize)
     pdf_dir = self.config.get("pdf_dir")
     text_column = self.config.get("text_column")
+    n = 0
     for df in res:
         add_langid = AddLangID(table, pdf_dir, text_column)
-        df = parallel.run(df, add_langid.make, cores)
+        n += 1
+        # df = parallel.run(df, add_langid.make, cores)
+        df = add_langid.make(df)
         df["lang_date"] = util.now()
         self.db.insert(df, "_lang")
@@ -55,12 +64,12 @@ def make(self, df: pd.DataFrame):
             is_file = True
             s = self._make_filepath(df)
         # run language id
-        model = fasttext.load_model(self.model_file)
+        # model = fasttext.load_model(self.model_file)
         lid = langid.LangID(
             s,
             self.sample_kwargs,
-            None,
-            model,
+            nlp,  # stanza nlp object
+            None,  # fasttext model (deprecated)
             self.threshold,
             is_file=is_file,
         )
diff --git a/corpusama/util/langid.py b/corpusama/util/langid.py
index 8b4ea16..d817ba6 100644
--- a/corpusama/util/langid.py
+++ b/corpusama/util/langid.py
@@ -47,8 +47,9 @@ from logging.handlers import TimedRotatingFileHandler
 from time import perf_counter
 from typing import Callable
+from math import ceil
 
-import fasttext
+# import fasttext
 import numpy as np
 import pandas as pd
 import stanza
@@ -130,7 +131,7 @@ def sample_lines(
     return list(clean)[:sample_size]
 
 
-def _get_lines(s: str, is_file: bool, sample_kwargs: dict) -> dict:
+def _get_lines(s: str, is_file: bool, sample_kwargs: dict) -> list | dict:
     """Opens a file and runs sample_lines(): logs a warning if there's no content.
 
     Args:
@@ -216,7 +217,7 @@ def _sort_lines(lines: list, sample_kwargs: dict) -> dict:
 
 @_li_wrapper
 def identify_stanza(
-    s: str, is_file: bool, sample_kwargs: dict, nlp: stanza.Pipeline
+    s: str, is_file: bool, sample_kwargs: dict, nlp: stanza.Pipeline, chunksize: int = 1000000
 ) -> dict:
     """Runs Stanza LI on `s`, returns a dict with results.
 
     Args:
         s: A string of text or path to a file.
         is_file: Whether `s` is a filepath `True` or a text `False`.
         sample_kwargs: Args passed to `sample_lines()` and `clean_lines()`.
         nlp: Stanza NLP pipeline.
+        chunksize: Max bytes of text fed to Stanza at a time.
     """
+    results = {"langs": [], "bytes": []}
     sample = _get_lines(s, is_file, sample_kwargs)
     if not sample:
-        return {"langs": [], "bytes": []}
-    dt = _sort_lines(sample, sample_kwargs)
-    docs = [stanza.Document([], text=t) for t in dt["long"]]
-    nlp(docs)
-    return {
-        "langs": [doc.lang for doc in docs] + dt["langs_short"],
-        "bytes": [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"],
-    }
+        return results
+    size = len("".join(sample).encode("utf8"))
+    chunks = 1
+    if size > chunksize:
+        chunks = ceil(size / chunksize)  # a single pass broke at 2,903,051 bytes
+        logging.info(f"split {size} bytes into {chunks} chunks")
+
+    def _inner(batch) -> None:
+        dt = _sort_lines(batch, sample_kwargs)
+        docs = [stanza.Document([], text=t) for t in dt["long"]]
+        nlp(docs)
+        results["langs"].extend([doc.lang for doc in docs] + dt["langs_short"])
+        results["bytes"].extend(
+            [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"]
+        )
+
+    for batch in np.array_split(sample, chunks):
+        _inner(batch)
+
+    return results
 
 
 @_li_wrapper
@@ -243,7 +257,7 @@ def identify_fasttext(
     s: str,
     is_file: bool,
     sample_kwargs: dict,
-    model: fasttext.FastText._FastText,
+    # model: fasttext.FastText._FastText,
 ) -> dict:
     """Runs fasttext LI on `s`, returns a dict with results.
 
@@ -253,17 +267,18 @@ def identify_fasttext(
     Args:
         s: A string of text or path to a file.
         is_file: Whether `s` is a filepath `True` or a text `False`.
         sample_kwargs: Args passes to `sample_lines()` and `clean_lines()`.
         model: A `_FastText` object.
     """
-    sample = _get_lines(s, is_file, sample_kwargs)
-    if not sample:
-        return {"langs": [], "scores": [], "bytes": []}
-    dt = _sort_lines(sample, sample_kwargs)
-    res = model.predict(dt["long"])
-    return {
-        "langs": [y.replace("__label__", "") for x in res[0] for y in x]
-        + dt["langs_short"],
-        "scores": [y for x in res[1] for y in x] + [1] * len(dt["langs_short"]),
-        "bytes": [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"],
-    }
+    print("WARNING: fasttext is deprecated as of v0.3.0")
+    # sample = _get_lines(s, is_file, sample_kwargs)
+    # if not sample:
+    #     return {"langs": [], "scores": [], "bytes": []}
+    # dt = _sort_lines(sample, sample_kwargs)
+    # res = model.predict(dt["long"])
+    # return {
+    #     "langs": [y.replace("__label__", "") for x in res[0] for y in x]
+    #     + dt["langs_short"],
+    #     "scores": [y for x in res[1] for y in x] + [1] * len(dt["langs_short"]),
+    #     "bytes": [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"],
+    # }
 
 
 def analyze(
@@ -322,7 +337,7 @@ def identify(
     s: str | list,
     sample_kwargs: dict,
     nlp: stanza.Pipeline | None,
-    model: fasttext.FastText._FastText | None,
+    model: None,  # fasttext.FastText._FastText | None
     threshold: float = 0.6,
     columns: list = li_columns,
     is_file: bool = True,
@@ -487,15 +502,21 @@ def __init__(
         s: str | list,
         sample_kwargs: dict,
         nlp: stanza.Pipeline | None,
-        model: fasttext.FastText._FastText | None,
+        model: None,  # fasttext.FastText._FastText | None
         threshold: float,
         columns: list = li_columns,
         is_file: bool = True,
     ):
         self.df = identify(s, sample_kwargs, nlp, model, threshold, columns, is_file)
-        self.add_multiling()
-        self.add_l1()
-        self.add_l1_size()
+        if "lid" in self.df.columns:
+            self.add_multiling()
+            self.add_l1()
+            self.add_l1_size()
+        else:
+            self.df["lid"] = None
+            self.df["multiling"] = None
+            self.df["l1"] = None
+            self.df["l1_size"] = None
 
 
 def file_stats(files: list, out: str = "file-stats") -> None:
diff --git a/test/test_util/test_langid.py b/test/test_util/test_langid.py
index a4642d7..1a28aac 100644
--- a/test/test_util/test_langid.py
+++ b/test/test_util/test_langid.py
@@ -1,8 +1,8 @@
 import pathlib
 import unittest
 
-import fasttext
-import pandas as pd
+# import fasttext
+# import pandas as pd
 import stanza
 
 from corpusama.util import langid
@@ -16,7 +16,7 @@ def setUpClass(cls):
         processors="langid",
         download_method=None,
     )
-    cls.model = fasttext.load_model("./fastText/lid.176.bin")
+    # cls.model = fasttext.load_model("./fastText/lid.176.bin")
     cls.file = "test/test_util/text-file.txt"
     cls.files = [cls.file, "test/test_util/text-file-2.txt"]
     cls.empty_file = "test/test_util/empty-file.txt"
@@ -50,18 +50,18 @@ def test_identify_stanza_empty(self):
         dt = langid.identify_stanza(self.empty_file, True, self.sample_kwargs, self.nlp)
         self.assertEqual(dt["langs"], [])
 
-    def test_identify_fasttext(self):
-        dt = langid.identify_fasttext(self.file, True, self.sample_kwargs, self.model)
-        self.assertTrue("en" in dt["langs"])
+    # def test_identify_fasttext(self):
+    #     dt = langid.identify_fasttext(self.file, True, self.sample_kwargs, self.model)
+    #     self.assertTrue("en" in dt["langs"])
 
-    def test_fasttext_empty_full(self):
-        dt = langid.identify_fasttext(
-            self.empty_file, True, self.sample_kwargs, self.model
-        )
-        self.assertEqual(dt["langs"], [])
+    # def test_fasttext_empty_full(self):
+    #     dt = langid.identify_fasttext(
+    #         self.empty_file, True, self.sample_kwargs, self.model
+    #     )
+    #     self.assertEqual(dt["langs"], [])
 
     def test_identify(self):
-        df = langid.identify(self.file, self.sample_kwargs, self.nlp, self.model)
+        df = langid.identify(self.file, self.sample_kwargs, self.nlp, None)
         self.assertEqual(df["tool"][0], "stanza")
 
     def test_identify_empty_no_fa(self):
@@ -73,20 +73,20 @@ def test_identify_empty_no_fa(self):
         df = langid.identify(
             self.empty_file,
             self.sample_kwargs,
             self.nlp,
             None,
         )
         self.assertEqual(df["tool"][0], "stanza")
 
-    def test_LangID_texts_only_fa_with_empty(self):
-        # NOTE: could break if text languages aren't predicted correctly
-        texts = [
-            "hello, my name is John\nand I speak English",
-            " ",
-            "hola, mi nombre es José\ny hablo español",
-        ]
-        lid = langid.LangID(
-            texts, self.sample_kwargs, None, self.model, 0.6, is_file=False
-        )
-        self.assertTrue(pd.isnull(lid.df["file"][0]))
-        self.assertEqual(lid.df["l1"][0], "en")
-        self.assertTrue(pd.isnull(lid.df["l1"][1]))
-        self.assertEqual(lid.df["l1"][2], "es")
+    # def test_LangID_texts_only_fa_with_empty(self):
+    #     # NOTE: could break if text languages aren't predicted correctly
+    #     texts = [
+    #         "hello, my name is John\nand I speak English",
+    #         " ",
+    #         "hola, mi nombre es José\ny hablo español",
+    #     ]
+    #     lid = langid.LangID(
+    #         texts, self.sample_kwargs, None, self.model, 0.6, is_file=False
+    #     )
+    #     self.assertTrue(pd.isnull(lid.df["file"][0]))
+    #     self.assertEqual(lid.df["l1"][0], "en")
+    #     self.assertTrue(pd.isnull(lid.df["l1"][1]))
+    #     self.assertEqual(lid.df["l1"][2], "es")
 
     def test_file_concat(self):
         out = [
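-- 
Usage sketch (not applied by this patch): how the stanza-only path introduced
above is meant to be called. A minimal sketch, assuming a local corpusama
checkout run from the repo root; the sample_kwargs keys shown are illustrative
assumptions, not the module's documented defaults.

    import stanza

    from corpusama.util import langid

    # Multilingual pipeline with the langid processor, as configured in the patch.
    nlp = stanza.Pipeline(lang="multilingual", processors="langid")

    # Hypothetical sampling settings; see sample_lines()/clean_lines() in
    # corpusama/util/langid.py for the keys actually accepted.
    sample_kwargs = {"sample_size": 10}

    # identify() now runs stanza only: the fasttext slot is passed as None.
    df = langid.identify("test/test_util/text-file.txt", sample_kwargs, nlp, None)
    print(df["tool"][0])  # expected: "stanza"

    # LangID wraps identify() and adds the l1/multiling columns; after this
    # patch those columns are filled with None when no usable text survives.
    lid = langid.LangID("test/test_util/text-file.txt", sample_kwargs, nlp, None, 0.6)
    print(lid.df["l1"][0])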