From 2e2d5e5203b44afc1d9e0d99cb5e00a0f1ee84cb Mon Sep 17 00:00:00 2001
From: engisalor <50170623+engisalor@users.noreply.github.com>
Date: Tue, 18 Jun 2024 13:22:33 +0200
Subject: [PATCH] feat: use stanza, deprecate fasttext for langid

---
 corpusama/corpus/langid.py    | 27 ++++++++----
 corpusama/util/langid.py      | 77 ++++++++++++++++++++++-------------
 test/test_util/test_langid.py | 52 +++++++++++------------
 3 files changed, 93 insertions(+), 63 deletions(-)

diff --git a/corpusama/corpus/langid.py b/corpusama/corpus/langid.py
index 88a97c6..ce5bf8a 100644
--- a/corpusama/corpus/langid.py
+++ b/corpusama/corpus/langid.py
@@ -1,17 +1,23 @@
 """Methods to classify document languages and save results to the `_lang` table."""
-import fasttext
+# import fasttext
 import pandas as pd
-
-from corpusama.util import convert, langid, parallel, util
+import stanza
+from corpusama.util import convert, langid, util
 
 # TODO requires unit testing
+nlp = stanza.Pipeline(
+    lang="multilingual",
+    processors="langid",
+    ld_batch_size=64,  # batch size to use for language identification
+    max_cache_size=10,  # max number of pipelines to cache
+)
 def make_langid(
     self,
     table: str,
     chunksize: int = 5000,
-    cores=0,
+    # cores=0,
 ) -> None:
     """Generates language ID data in the `_lang` table.
@@ -27,13 +33,16 @@ def make_langid(
     query = "SELECT * FROM _pdf"
     if table == "_raw":
         query = "SELECT * FROM _raw WHERE body_html IS NOT null"
-    cores = parallel.set_cores(cores)
+    # cores = parallel.set_cores(cores)
     res = pd.read_sql(query, self.db.conn, chunksize=chunksize)
     pdf_dir = self.config.get("pdf_dir")
     text_column = self.config.get("text_column")
+    n = 0
     for df in res:
         add_langid = AddLangID(table, pdf_dir, text_column)
-        df = parallel.run(df, add_langid.make, cores)
+        n += 1
+        # df = parallel.run(df, add_langid.make, cores)
+        df = add_langid.make(df)
         df["lang_date"] = util.now()
         self.db.insert(df, "_lang")
@@ -55,12 +64,12 @@ def make(self, df: pd.DataFrame):
             is_file = True
             s = self._make_filepath(df)
         # run language id
-        model = fasttext.load_model(self.model_file)
+        # model = fasttext.load_model(self.model_file)
         lid = langid.LangID(
             s,
             self.sample_kwargs,
-            None,
-            model,
+            nlp,  # stanza nlp object
+            None,  # fasttext model (deprecated)
             self.threshold,
             is_file=is_file,
         )
diff --git a/corpusama/util/langid.py b/corpusama/util/langid.py
index 8b4ea16..d817ba6 100644
--- a/corpusama/util/langid.py
+++ b/corpusama/util/langid.py
@@ -47,8 +47,9 @@ from logging.handlers import TimedRotatingFileHandler
 from time import perf_counter
 from typing import Callable
+from math import ceil
 
-import fasttext
+# import fasttext
 import numpy as np
 import pandas as pd
 import stanza
@@ -130,7 +131,7 @@ def sample_lines(
     return list(clean)[:sample_size]
 
 
-def _get_lines(s: str, is_file: bool, sample_kwargs: dict) -> dict:
+def _get_lines(s: str, is_file: bool, sample_kwargs: dict) -> list | dict:
     """Opens a file and runs sample_lines(): logs a warning if there's no content.
 
     Args:
@@ -216,7 +217,7 @@ def _sort_lines(lines: list, sample_kwargs: dict) -> dict:
 
 @_li_wrapper
 def identify_stanza(
-    s: str, is_file: bool, sample_kwargs: dict, nlp: stanza.Pipeline
+    s: str, is_file: bool, sample_kwargs: dict, nlp: stanza.Pipeline, chunksize: int = 1000000
 ) -> dict:
     """Runs Stanza LI on `s`, returns a dict with results.
 
     Args:
         s: A string of text or path to a file.
         is_file: Whether `s` is a filepath `True` or a text `False`.
         sample_kwargs: Args passed to `sample_lines()` and `clean_lines()`.
         nlp: Stanza NLP pipeline.
+        chunksize: Max bytes of text fed to Stanza at a time.
     """
+    results = {"langs": [], "bytes": []}
     sample = _get_lines(s, is_file, sample_kwargs)
     if not sample:
-        return {"langs": [], "bytes": []}
-    dt = _sort_lines(sample, sample_kwargs)
-    docs = [stanza.Document([], text=t) for t in dt["long"]]
-    nlp(docs)
-    return {
-        "langs": [doc.lang for doc in docs] + dt["langs_short"],
-        "bytes": [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"],
-    }
+        return results
+    size = len("".join(sample).encode("utf8"))
+    chunks = 1
+    if size > chunksize:
+        chunks = ceil(size / chunksize)  # a single pass broke at 2,903,051 bytes
+        logging.info(f"split {size} bytes into {chunks} chunks")
+
+    def _inner(batch) -> None:
+        dt = _sort_lines(batch, sample_kwargs)
+        docs = [stanza.Document([], text=t) for t in dt["long"]]
+        nlp(docs)
+        results["langs"].extend([doc.lang for doc in docs] + dt["langs_short"])
+        results["bytes"].extend(
+            [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"]
+        )
+
+    for batch in np.array_split(sample, chunks):
+        _inner(batch)
+
+    return results
 
 
 @_li_wrapper
@@ -243,7 +257,7 @@ def identify_fasttext(
     s: str,
     is_file: bool,
     sample_kwargs: dict,
-    model: fasttext.FastText._FastText,
+    # model: fasttext.FastText._FastText,
 ) -> dict:
     """Runs fasttext LI on `s`, returns a dict with results.
 
@@ -253,17 +267,18 @@ def identify_fasttext(
     Args:
         s: A string of text or path to a file.
         is_file: Whether `s` is a filepath `True` or a text `False`.
         sample_kwargs: Args passes to `sample_lines()` and `clean_lines()`.
         model: A `_FastText` object.
     """
-    sample = _get_lines(s, is_file, sample_kwargs)
-    if not sample:
-        return {"langs": [], "scores": [], "bytes": []}
-    dt = _sort_lines(sample, sample_kwargs)
-    res = model.predict(dt["long"])
-    return {
-        "langs": [y.replace("__label__", "") for x in res[0] for y in x]
-        + dt["langs_short"],
-        "scores": [y for x in res[1] for y in x] + [1] * len(dt["langs_short"]),
-        "bytes": [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"],
-    }
+    print("WARNING: fasttext is deprecated as of v0.3.0")
+    # sample = _get_lines(s, is_file, sample_kwargs)
+    # if not sample:
+    #     return {"langs": [], "scores": [], "bytes": []}
+    # dt = _sort_lines(sample, sample_kwargs)
+    # res = model.predict(dt["long"])
+    # return {
+    #     "langs": [y.replace("__label__", "") for x in res[0] for y in x]
+    #     + dt["langs_short"],
+    #     "scores": [y for x in res[1] for y in x] + [1] * len(dt["langs_short"]),
+    #     "bytes": [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"],
+    # }
 
 
 def analyze(
@@ -322,7 +337,7 @@ def identify(
     s: str | list,
     sample_kwargs: dict,
     nlp: stanza.Pipeline | None,
-    model: fasttext.FastText._FastText | None,
+    model: None,  # fasttext.FastText._FastText | None
     threshold: float = 0.6,
     columns: list = li_columns,
     is_file: bool = True,
@@ -487,15 +502,21 @@ def __init__(
         s: str | list,
         sample_kwargs: dict,
         nlp: stanza.Pipeline | None,
-        model: fasttext.FastText._FastText | None,
+        model: None,  # fasttext.FastText._FastText | None
         threshold: float,
         columns: list = li_columns,
         is_file: bool = True,
     ):
         self.df = identify(s, sample_kwargs, nlp, model, threshold, columns, is_file)
-        self.add_multiling()
-        self.add_l1()
-        self.add_l1_size()
+        if "lid" in self.df.columns:
+            self.add_multiling()
+            self.add_l1()
+            self.add_l1_size()
+        else:
+            self.df["lid"] = None
+            self.df["multiling"] = None
+            self.df["l1"] = None
+            self.df["l1_size"] = None
 
 
 def file_stats(files: list, out: str = "file-stats") -> None:
diff --git a/test/test_util/test_langid.py b/test/test_util/test_langid.py
index a4642d7..1a28aac 100644
--- a/test/test_util/test_langid.py
+++ b/test/test_util/test_langid.py
@@ -1,8 +1,8 @@
 import pathlib
 import unittest
 
-import fasttext
-import pandas as pd
+# import fasttext
+# import pandas as pd
 import stanza
 
 from corpusama.util import langid
@@ -16,7 +16,7 @@ def setUpClass(cls):
         processors="langid",
         download_method=None,
     )
-    cls.model = fasttext.load_model("./fastText/lid.176.bin")
+    # cls.model = fasttext.load_model("./fastText/lid.176.bin")
     cls.file = "test/test_util/text-file.txt"
     cls.files = [cls.file, "test/test_util/text-file-2.txt"]
     cls.empty_file = "test/test_util/empty-file.txt"
@@ -50,18 +50,18 @@ def test_identify_stanza_empty(self):
         dt = langid.identify_stanza(self.empty_file, True, self.sample_kwargs, self.nlp)
         self.assertEqual(dt["langs"], [])
 
-    def test_identify_fasttext(self):
-        dt = langid.identify_fasttext(self.file, True, self.sample_kwargs, self.model)
-        self.assertTrue("en" in dt["langs"])
+    # def test_identify_fasttext(self):
+    #     dt = langid.identify_fasttext(self.file, True, self.sample_kwargs, self.model)
+    #     self.assertTrue("en" in dt["langs"])
 
-    def test_fasttext_empty_full(self):
-        dt = langid.identify_fasttext(
-            self.empty_file, True, self.sample_kwargs, self.model
-        )
-        self.assertEqual(dt["langs"], [])
+    # def test_fasttext_empty_full(self):
+    #     dt = langid.identify_fasttext(
+    #         self.empty_file, True, self.sample_kwargs, self.model
+    #     )
+    #     self.assertEqual(dt["langs"], [])
 
     def test_identify(self):
-        df = langid.identify(self.file, self.sample_kwargs, self.nlp, self.model)
+        df = langid.identify(self.file, self.sample_kwargs, self.nlp, None)
         self.assertEqual(df["tool"][0], "stanza")
 
     def test_identify_empty_no_fa(self):
@@ -73,20 +73,20 @@ def test_identify_empty_no_fa(self):
         df = langid.identify(
             self.empty_file,
             self.sample_kwargs,
             self.nlp,
             None,
         )
         self.assertEqual(df["tool"][0], "stanza")
 
-    def test_LangID_texts_only_fa_with_empty(self):
-        # NOTE: could break if text languages aren't predicted correctly
-        texts = [
-            "hello, my name is John\nand I speak English",
-            " ",
-            "hola, mi nombre es José\ny hablo español",
-        ]
-        lid = langid.LangID(
-            texts, self.sample_kwargs, None, self.model, 0.6, is_file=False
-        )
-        self.assertTrue(pd.isnull(lid.df["file"][0]))
-        self.assertEqual(lid.df["l1"][0], "en")
-        self.assertTrue(pd.isnull(lid.df["l1"][1]))
-        self.assertEqual(lid.df["l1"][2], "es")
+    # def test_LangID_texts_only_fa_with_empty(self):
+    #     # NOTE: could break if text languages aren't predicted correctly
+    #     texts = [
+    #         "hello, my name is John\nand I speak English",
+    #         " ",
+    #         "hola, mi nombre es José\ny hablo español",
+    #     ]
+    #     lid = langid.LangID(
+    #         texts, self.sample_kwargs, None, self.model, 0.6, is_file=False
+    #     )
+    #     self.assertTrue(pd.isnull(lid.df["file"][0]))
+    #     self.assertEqual(lid.df["l1"][0], "en")
+    #     self.assertTrue(pd.isnull(lid.df["l1"][1]))
+    #     self.assertEqual(lid.df["l1"][2], "es")
 
     def test_file_concat(self):
         out = [
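-- 
Usage sketch (not applied by this patch): how the stanza-only path introduced
above is meant to be called. A minimal sketch, assuming a local corpusama
checkout run from the repo root; the sample_kwargs keys shown are illustrative
assumptions, not the module's documented defaults.

    import stanza

    from corpusama.util import langid

    # Multilingual pipeline with the langid processor, as configured in the patch.
    nlp = stanza.Pipeline(lang="multilingual", processors="langid")

    # Hypothetical sampling settings; see sample_lines()/clean_lines() in
    # corpusama/util/langid.py for the keys actually accepted.
    sample_kwargs = {"sample_size": 10}

    # identify() now runs stanza only: the fasttext slot is passed as None.
    df = langid.identify("test/test_util/text-file.txt", sample_kwargs, nlp, None)
    print(df["tool"][0])  # expected: "stanza"

    # LangID wraps identify() and adds the l1/multiling columns; after this
    # patch those columns are filled with None when no usable text survives.
    lid = langid.LangID("test/test_util/text-file.txt", sample_kwargs, nlp, None, 0.6)
    print(lid.df["l1"][0])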