Commit

feat: use stanza, deprecated fasttext for langid
engisalor committed Jun 18, 2024
1 parent f00606e commit 2e2d5e5
Showing 3 changed files with 93 additions and 63 deletions.
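Note: this commit replaces the fastText classifier with Stanza's multilingual language-ID pipeline, configured once at module level in corpusama/corpus/langid.py (see the diff below). A minimal, self-contained sketch of that usage pattern, assuming the multilingual langid model has already been downloaded; the example texts are made up:

import stanza

# one-time model download (not shown running here): stanza.download(lang="multilingual")
nlp = stanza.Pipeline(
    lang="multilingual",
    processors="langid",
    ld_batch_size=64,   # batch size for language identification
    max_cache_size=10,  # max number of cached per-language pipelines
)

# wrap each text in an empty Document; nlp() sets doc.lang in place
docs = [stanza.Document([], text=t) for t in ["Hello, how are you?", "Hola, ¿cómo estás?"]]
nlp(docs)
print([doc.lang for doc in docs])  # expected: ["en", "es"]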
27 changes: 18 additions & 9 deletions corpusama/corpus/langid.py
@@ -1,17 +1,23 @@
"""Methods to classify document languages and save results to the `_lang` table."""
import fasttext
# import fasttext
import pandas as pd

from corpusama.util import convert, langid, parallel, util
import stanza
from corpusama.util import convert, langid, util

# TODO requires unit testing

nlp = stanza.Pipeline(
lang="multilingual",
processors="langid",
ld_batch_size=64,  # batch size to use for language identification
max_cache_size=10,  # max number of pipelines to cache
)

def make_langid(
self,
table: str,
chunksize: int = 5000,
cores=0,
# cores=0,
) -> None:
"""Generates language ID data in the `_lang` table.
@@ -27,13 +33,16 @@ def make_langid(
query = "SELECT * FROM _pdf"
if table == "_raw":
query = "SELECT * FROM _raw WHERE body_html IS NOT null"
cores = parallel.set_cores(cores)
# cores = parallel.set_cores(cores)
res = pd.read_sql(query, self.db.conn, chunksize=chunksize)
pdf_dir = self.config.get("pdf_dir")
text_column = self.config.get("text_column")
n = 0
for df in res:
add_langid = AddLangID(table, pdf_dir, text_column)
df = parallel.run(df, add_langid.make, cores)
n += 1
# df = parallel.run(df, add_langid.make, cores)
df = add_langid.make(df)
df["lang_date"] = util.now()
self.db.insert(df, "_lang")

@@ -55,12 +64,12 @@ def make(self, df: pd.DataFrame):
is_file = True
s = self._make_filepath(df)
# run language id
model = fasttext.load_model(self.model_file)
# model = fasttext.load_model(self.model_file)
lid = langid.LangID(
s,
self.sample_kwargs,
None,
model,
nlp,  # stanza nlp object
None,  # fasttext model (deprecated)
self.threshold,
is_file=is_file,
)
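In make_langid above, rows are now streamed from the database in chunksize-row batches and classified in-process with add_langid.make() instead of going through parallel.run. A toy sketch of the chunked pd.read_sql pattern the loop relies on (the in-memory table and its values are illustrative, not from the corpus):

import sqlite3

import pandas as pd

# stand-in for the corpus database; the real query targets _raw or _pdf
conn = sqlite3.connect(":memory:")
pd.DataFrame({"body_html": ["<p>hello</p>", None, "<p>hola</p>"]}).to_sql("_raw", conn, index=False)

query = "SELECT * FROM _raw WHERE body_html IS NOT null"
for df in pd.read_sql(query, conn, chunksize=1):
    # each chunk would be passed to AddLangID.make() and inserted into _lang
    print(len(df), "row(s) in this chunk")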
77 changes: 49 additions & 28 deletions corpusama/util/langid.py
@@ -47,8 +49,9 @@
from logging.handlers import TimedRotatingFileHandler
from time import perf_counter
from typing import Callable
from math import ceil

import fasttext
# import fasttext
import numpy as np
import pandas as pd
import stanza
@@ -130,7 +131,7 @@ def sample_lines(
return list(clean)[:sample_size]


def _get_lines(s: str, is_file: bool, sample_kwargs: dict) -> dict:
def _get_lines(s: str, is_file: bool, sample_kwargs: dict) -> list|dict:
"""Opens a file and runs sample_lines(): logs a warning if there's no content.
Args:
@@ -216,7 +217,7 @@ def _sort_lines(lines: list, sample_kwargs: dict) -> dict:

@_li_wrapper
def identify_stanza(
s: str, is_file: bool, sample_kwargs: dict, nlp: stanza.Pipeline
s: str, is_file: bool, sample_kwargs: dict, nlp: stanza.Pipeline, chunksize: int = 1000000
) -> dict:
"""Runs Stanza LI on `s`, returns a dict with results.
@@ -225,25 +226,38 @@ def identify_stanza(
is_file: Whether `s` is a filepath `True` or a text `False`.
sample_kwargs: Args passed to `sample_lines()` and `clean_lines()`.
nlp: Stanza NLP pipeline.
chunksize: Max bytes of text fed to Stanza at a time.
"""
results = {"langs": [], "bytes": []}
sample = _get_lines(s, is_file, sample_kwargs)
if not sample:
return {"langs": [], "bytes": []}
dt = _sort_lines(sample, sample_kwargs)
docs = [stanza.Document([], text=t) for t in dt["long"]]
nlp(docs)
return {
"langs": [doc.lang for doc in docs] + dt["langs_short"],
"bytes": [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"],
}
return results
size = len("".join(sample).encode("utf8"))
chunks = 1
if size > chunksize:
chunks = ceil(size / chunksize)  # e.g. a 2903051-byte sample breaks a single pass
logging.info(f"split {size} bytes into {chunks} chunks")

def _inner(batch) -> None:
dt = _sort_lines(batch, sample_kwargs)
docs = [stanza.Document([], text=t) for t in dt["long"]]
nlp(docs)
results["langs"].extend([doc.lang for doc in docs] + dt["langs_short"])
results["bytes"].extend([len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"])

for batch in np.array_split(sample, chunks):
_inner(batch)

return results



@_li_wrapper
def identify_fasttext(
s: str,
is_file: bool,
sample_kwargs: dict,
model: fasttext.FastText._FastText,
# model: fasttext.FastText._FastText,
) -> dict:
"""Runs fasttext LI on `s`, returns a dict with results.
@@ -253,17 +267,18 @@ def identify_fasttext(
sample_kwargs: Args passed to `sample_lines()` and `clean_lines()`.
model: A `_FastText` object.
"""
sample = _get_lines(s, is_file, sample_kwargs)
if not sample:
return {"langs": [], "scores": [], "bytes": []}
dt = _sort_lines(sample, sample_kwargs)
res = model.predict(dt["long"])
return {
"langs": [y.replace("__label__", "") for x in res[0] for y in x]
+ dt["langs_short"],
"scores": [y for x in res[1] for y in x] + [1] * len(dt["langs_short"]),
"bytes": [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"],
}
print("WARNING: fasttext is deprecated as of v0.3.0")
# sample = _get_lines(s, is_file, sample_kwargs)
# if not sample:
# return {"langs": [], "scores": [], "bytes": []}
# dt = _sort_lines(sample, sample_kwargs)
# res = model.predict(dt["long"])
# return {
# "langs": [y.replace("__label__", "") for x in res[0] for y in x]
# + dt["langs_short"],
# "scores": [y for x in res[1] for y in x] + [1] * len(dt["langs_short"]),
# "bytes": [len(x.encode("utf8")) for x in dt["long"]] + dt["bytes_short"],
# }


def analyze(
@@ -322,7 +337,7 @@ def identify(
s: str | list,
sample_kwargs: dict,
nlp: stanza.Pipeline | None,
model: fasttext.FastText._FastText | None,
model: None,  # fasttext.FastText._FastText | None,
threshold: float = 0.6,
columns: list = li_columns,
is_file: bool = True,
@@ -487,15 +502,21 @@ def __init__(
s: str | list,
sample_kwargs: dict,
nlp: stanza.Pipeline | None,
model: fasttext.FastText._FastText | None,
model: None, # fasttext.FastText._FastText | None,
threshold: float,
columns: list = li_columns,
is_file: bool = True,
):
self.df = identify(s, sample_kwargs, nlp, model, threshold, columns, is_file)
self.add_multiling()
self.add_l1()
self.add_l1_size()
if "lid" in self.df.columns:
self.add_multiling()
self.add_l1()
self.add_l1_size()
else:
self.df["lid"] = None
self.df["multiling"] = None
self.df["l1"] = None
self.df["l1_size"] = None


def file_stats(files: list, out: str = "file-stats") -> None:
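The chunking added to identify_stanza above guards against oversized inputs: it measures the UTF-8 byte size of the sampled lines and, when that exceeds chunksize (1,000,000 bytes by default), splits them into ceil(size / chunksize) batches with np.array_split, e.g. the 2,903,051-byte case noted in the comment becomes 3 batches. A minimal sketch of that split in isolation (the sample strings are made up):

from math import ceil

import numpy as np

sample = ["first line of text", "second line", "third line", "fourth line"]
chunksize = 1_000_000  # max bytes handed to the Stanza pipeline per batch

size = len("".join(sample).encode("utf8"))
chunks = ceil(size / chunksize) if size > chunksize else 1
for batch in np.array_split(sample, chunks):
    # in the real code each batch goes through _sort_lines() and the shared pipeline
    print(len(batch), "lines,", len("".join(batch).encode("utf8")), "bytes")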
52 changes: 26 additions & 26 deletions test/test_util/test_langid.py
@@ -1,8 +1,8 @@
import pathlib
import unittest

import fasttext
import pandas as pd
# import fasttext
# import pandas as pd
import stanza

from corpusama.util import langid
@@ -16,7 +16,7 @@ def setUpClass(cls):
processors="langid",
download_method=None,
)
cls.model = fasttext.load_model("./fastText/lid.176.bin")
# cls.model = fasttext.load_model("./fastText/lid.176.bin")
cls.file = "test/test_util/text-file.txt"
cls.files = [cls.file, "test/test_util/text-file-2.txt"]
cls.empty_file = "test/test_util/empty-file.txt"
@@ -50,18 +50,18 @@ def test_identify_stanza_empty(self):
dt = langid.identify_stanza(self.empty_file, True, self.sample_kwargs, self.nlp)
self.assertEqual(dt["langs"], [])

def test_identify_fasttext(self):
dt = langid.identify_fasttext(self.file, True, self.sample_kwargs, self.model)
self.assertTrue("en" in dt["langs"])
# def test_identify_fasttext(self):
# dt = langid.identify_fasttext(self.file, True, self.sample_kwargs, self.model)
# self.assertTrue("en" in dt["langs"])

def test_fasttext_empty_full(self):
dt = langid.identify_fasttext(
self.empty_file, True, self.sample_kwargs, self.model
)
self.assertEqual(dt["langs"], [])
# def test_fasttext_empty_full(self):
# dt = langid.identify_fasttext(
# self.empty_file, True, self.sample_kwargs, self.model
# )
# self.assertEqual(dt["langs"], [])

def test_identify(self):
df = langid.identify(self.file, self.sample_kwargs, self.nlp, self.model)
df = langid.identify(self.file, self.sample_kwargs, self.nlp, None)
self.assertEqual(df["tool"][0], "stanza")

def test_identify_empty_no_fa(self):
@@ -73,20 +73,20 @@ def test_identify_empty_no_fa(self):
)
self.assertEqual(df["tool"][0], "stanza")

def test_LangID_texts_only_fa_with_empty(self):
# NOTE: could break if text languages aren't predicted correctly
texts = [
"hello, my name is John\nand I speak English",
" ",
"hola, mi nombre es José\ny hablo español",
]
lid = langid.LangID(
texts, self.sample_kwargs, None, self.model, 0.6, is_file=False
)
self.assertTrue(pd.isnull(lid.df["file"][0]))
self.assertEqual(lid.df["l1"][0], "en")
self.assertTrue(pd.isnull(lid.df["l1"][1]))
self.assertEqual(lid.df["l1"][2], "es")
# def test_LangID_texts_only_fa_with_empty(self):
# # NOTE: could break if text languages aren't predicted correctly
# texts = [
# "hello, my name is John\nand I speak English",
# " ",
# "hola, mi nombre es José\ny hablo español",
# ]
# lid = langid.LangID(
# texts, self.sample_kwargs, None, self.model, 0.6, is_file=False
# )
# self.assertTrue(pd.isnull(lid.df["file"][0]))
# self.assertEqual(lid.df["l1"][0], "en")
# self.assertTrue(pd.isnull(lid.df["l1"][1]))
# self.assertEqual(lid.df["l1"][2], "es")

def test_file_concat(self):
out = [
