From edb811dca694bfcfed3dc547a3485e2791332236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Fri, 7 Jul 2023 12:22:17 +0200 Subject: [PATCH 1/7] feat: new wrapper for huggingface multi-modal transformers + windowing option --- .../pipes/embeddings/huggingface-embedding.md | 7 + docs/pipes/embeddings/index.md | 15 +- docs/recipes/training.md | 4 +- .../pipes/embeddings/huggingface_embedding.py | 288 ++++++++++++++++++ pyproject.toml | 1 + tests/recipes/test_train.py | 50 +++ 6 files changed, 356 insertions(+), 9 deletions(-) create mode 100644 docs/pipes/embeddings/huggingface-embedding.md create mode 100644 edspdf/pipes/embeddings/huggingface_embedding.py diff --git a/docs/pipes/embeddings/huggingface-embedding.md b/docs/pipes/embeddings/huggingface-embedding.md new file mode 100644 index 00000000..2fc97695 --- /dev/null +++ b/docs/pipes/embeddings/huggingface-embedding.md @@ -0,0 +1,7 @@ +# HuggingfaceEmbedding {: #edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding } + +::: edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding + options: + heading_level: 2 + show_bases: false + show_source: false diff --git a/docs/pipes/embeddings/index.md b/docs/pipes/embeddings/index.md index f17843da..1225c682 100644 --- a/docs/pipes/embeddings/index.md +++ b/docs/pipes/embeddings/index.md @@ -10,13 +10,14 @@ td:nth-child(1), td:nth-child(2) { } -| Factory name | Description | -|----------------------------------------------------------------------------------------------|--------------------------------------------------------------------| -| [`simple-text-embedding`][edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding] | A module that embeds the textual features of the blocks. | -| [`embedding-combiner`][edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner] | Encodes boxes using a combination of multiple encoders | -| [`sub-box-cnn-pooler`][edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler] | Pools the output of a CNN over the elements of a box (like words) | -| [`box-layout-embedding`][edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding] | Encodes the layout of the boxes | -| [`box-transformer`][edspdf.pipes.embeddings.box_transformer.BoxTransformer] | Contextualizes box representations using a transformer | +| Factory name | Description | +|-----------------------------------------------------------------------------------------------|-------------------------------------------------------------------| +| [`simple-text-embedding`][edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding] | A module that embeds the textual features of the blocks. | +| [`embedding-combiner`][edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner] | Encodes boxes using a combination of multiple encoders | +| [`sub-box-cnn-pooler`][edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler] | Pools the output of a CNN over the elements of a box (like words) | +| [`box-layout-embedding`][edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding] | Encodes the layout of the boxes | +| [`box-transformer`][edspdf.pipes.embeddings.box_transformer.BoxTransformer] | Contextualizes box representations using a transformer | +| [`huggingface-embedding`][edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding] | Box representations using a Huggingface multi-modal model. | diff --git a/docs/recipes/training.md b/docs/recipes/training.md index 8a447572..539ca543 100644 --- a/docs/recipes/training.md +++ b/docs/recipes/training.md @@ -190,10 +190,10 @@ def segmentation_adapter( ## Full example -Let's wrap the training code in a function, and make it callable from the command line ! +Let's wrap the training code in a function, and make it callable from the command line using [confit](https://github.com/aphp/confit) ! ???+ example "train.py" - ```python linenums="1" hl_lines="16-27" + ```python linenums="1" import itertools import json from pathlib import Path diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py new file mode 100644 index 00000000..65a0fc67 --- /dev/null +++ b/edspdf/pipes/embeddings/huggingface_embedding.py @@ -0,0 +1,288 @@ +import math + +import torch +from foldedtensor import as_folded_tensor +from transformers import AutoImageProcessor, AutoModel, AutoTokenizer +from typing_extensions import Literal + +from edspdf import TrainablePipe, registry +from edspdf.pipeline import Pipeline +from edspdf.pipes.embeddings import EmbeddingOutput +from edspdf.structures import PDFDoc + + +def compute_contextualization_scores(windows): + ramp = torch.arange(0, windows.shape[1], 1) + scores = ( + torch.min(ramp, windows.mask.sum(1, keepdim=True) - 1 - ramp) + .clamp(min=0) + .view(-1) + ) + return scores + + +@registry.factory.register("huggingface-embedding") +class HuggingfaceEmbedding(TrainablePipe[EmbeddingOutput]): + """ + The HuggingfaceEmbeddings component is a wrapper around the Huggingface multi-modal + models. Compared to using the raw Huggingface model, we offer a simple mechanism to + split long documents into strided windows before feeding them to the model. + + Examples + -------- + + Here is an example of how to define a pipeline with the HuggingfaceEmbedding + component: + + ```python + from edspdf import Pipeline + + pipeline = Pipeline() + pipeline.add_pipe( + "mupdf-extractor", + name="extractor", + config={ + "render_pages": True, + }, + ) + pipeline.add_pipe( + "huggingface-embedding", + name="embedding", + config={ + "model": "microsoft/layoutlmv3-base", + "use_image": False, + "window": 128, + "stride": 64, + "line_pooling": "mean", + }, + ) + model.add_pipe( + "trainable-classifier", + name="classifier", + config={ + "embedding": model.get_pipe("embedding"), + "labels": [], + "activation": "relu", + }, + ) + ``` + + This model can then be trained following the [training recipe](/recipes/training/). + + Parameters + ---------- + pipeline: Pipeline + The pipeline instance + name: str + The component name + model: str + The Huggingface model name or path + use_image: bool + Whether to use the image or not in the model + window: int + The window size to use when splitting long documents into smaller windows + before feeding them to the Transformer model (default: 510 = 512 - 2) + stride: int + The stride (distance between windows) to use when splitting long documents into + smaller windows: (default: 510 / 2 = 255) + line_pooling: Literal["mean", "max", "sum"] + The pooling strategy to use when combining the embeddings of the tokens in a + line into a single line embedding + """ + + def __init__( + self, + pipeline: Pipeline = None, + name: str = "huggingface-embedding", + model: str = None, + use_image: bool = True, + window: int = 510, + stride: int = 255, + line_pooling: Literal["mean", "max", "sum"] = "mean", + ): + super().__init__(pipeline, name) + self.use_image = use_image + self.image_processor = ( + AutoImageProcessor.from_pretrained(model, apply_ocr=False) + if use_image + else None + ) + self.tokenizer = AutoTokenizer.from_pretrained(model) + self.hf_model = AutoModel.from_pretrained(model) + self.output_size = self.hf_model.config.hidden_size + self.window = window + self.stride = stride + self.line_pooling = line_pooling + + def preprocess(self, doc: PDFDoc): + res = { + "input_ids": [], + "bbox": [], + "windows": [], + "line_starts": [], + } + if self.use_image: + res["pixel_values"] = [] + + for page in doc.pages: + # Preprocess it using LayoutLMv3 + prep = self.tokenizer( + text=[line.text for line in doc.text_boxes], + boxes=[ + ( + int(line.x0 * line.page.width), + int(line.y0 * line.page.height), + int(line.x1 * line.page.width), + int(line.y1 * line.page.height), + ) + for line in doc.text_boxes + ], + word_labels=range(len(doc.text_boxes)), + return_attention_mask=True, + ) + if self.use_image: + prep.update(self.image_processor(images=page.image)) + + # Compute line offsets into layoutlm generated tokens + line_indices = prep["labels"][:-1] + line_starts = [ + i + for i, curr_index in enumerate(line_indices) + if curr_index != -100 and curr_index != line_indices[i - 1] + ] + + res["input_ids"].append(prep["input_ids"]) + res["bbox"].append(prep["bbox"]) + res["line_starts"].append(line_starts) + if self.use_image: + res["pixel_values"].append(prep["pixel_values"][0]) + + return res + + def collate(self, batch, device): + # Flatten most of these arrays to process batches page per page and + # not sample per sample + + offset = 0 + window_max_size = 0 + window_count = 0 + windows_per_page = [] + for sample_input_ids in batch["input_ids"]: + for page_input_ids in sample_input_ids: + # fmt: off + windows_per_page.append([ + [ + offset + 0, + *range(1 + offset + window_i * self.stride, + 1 + offset + min(window_i * self.stride + self.window, len(page_input_ids) - 2)), # noqa: E501 + offset + len(page_input_ids) - 1, + ] + for window_i in range(0, 1 + max(0, math.ceil((len(page_input_ids) - 2 - self.window) / self.stride))) # noqa: E501 + ]) + # fmt: on + offset += len(page_input_ids) + window_max_size = max( + window_max_size, max(map(len, windows_per_page[-1])) + ) + window_count += len(windows_per_page[-1]) + + windows = as_folded_tensor( + windows_per_page, + full_names=("page", "window", "token"), + data_dims=("window", "token"), + dtype=torch.long, + ) + indexer = torch.zeros(windows.max() + 1, dtype=torch.long) + + # Sort each occurrence of an initial token by its contextualization score: + # We can only use the amax reduction, so to retrieve the best occurrence, we + # insert the index of the token output by the transformer inside the score + # using a lexicographic approach + # (score + index / n_tokens) ~ (score * n_tokens + index), taking the max, + # and then retrieving the index of the token using the modulo operator. + scores = compute_contextualization_scores(windows) + scores = scores * len(scores) + torch.arange(len(scores)) + indexer.index_reduce_( + dim=0, + source=scores, + index=windows.view(-1), + reduce="amax", + ) + indexer %= len(scores) + + # Get token indices for each line -> sample, page, line, token + line_window_indices = [] + line_window_offsets_flat = [0] + offset = 0 + for sample_input_ids, sample_line_starts in zip( + batch["input_ids"], batch["line_starts"] + ): + sample_line_window_indices = [] + line_window_indices.append(sample_line_window_indices) + for page_line_starts, page_input_ids in zip( + sample_line_starts, sample_input_ids + ): + page_line_window_indices = [] + sample_line_window_indices.append(page_line_window_indices) + for line_start, line_end in zip( + page_line_starts, (*page_line_starts[1:], len(page_input_ids)) + ): + line_window_offsets_flat.append( + line_window_offsets_flat[-1] + line_end - line_start + ) + page_line_window_indices.append( + list(range(offset + line_start, offset + line_end)) + ) + offset += len(page_input_ids) + line_window_indices = as_folded_tensor( + line_window_indices, + full_names=("sample", "page", "line", "token"), + data_dims=("token",), + dtype=torch.long, + ) + line_window_offsets_flat = as_folded_tensor( + # discard the last offset, since we start from 0 and add each line length + data=torch.as_tensor(line_window_offsets_flat[:-1]), + data_dims=("line",), + full_names=("sample", "page", "line"), + lengths=line_window_indices.lengths[:-1], + ) + + kw = dict( + full_names=("sample", "page", "subword"), + data_dims=("subword",), + device=device, + ) + collated = { + "input_ids": as_folded_tensor(batch["input_ids"], **kw, dtype=torch.long), + "bbox": as_folded_tensor(batch["bbox"], **kw, dtype=torch.long), + "windows": windows, + "indexer": indexer[line_window_indices], + "line_window_indices": indexer[line_window_indices].as_tensor(), + "line_window_offsets_flat": line_window_offsets_flat, + } + if self.use_image: + collated["pixel_values"] = torch.stack( + [ + torch.as_tensor(x, device=device) + for x in as_folded_tensor( + batch["pixel_values"], **kw, dtype=torch.long + ) + ], + dim=0, + ) + return collated + + def forward(self, batch): + token_embeddings = self.hf_model.forward( + input_ids=batch["input_ids"].as_tensor()[batch["windows"]], + bbox=batch["bbox"].as_tensor()[batch["windows"]], + attention_mask=batch["windows"].mask, + ).last_hidden_state + line_embedding = torch.nn.functional.embedding_bag( + input=batch["line_window_indices"], + weight=token_embeddings.view(-1, token_embeddings.size(-1)), + offsets=batch["line_window_offsets_flat"], + mode=self.line_pooling, + ) + return {"embeddings": line_embedding} diff --git a/pyproject.toml b/pyproject.toml index a03a7091..63b906f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,6 +67,7 @@ docs = [ "box-preprocessor" = "edspdf.pipes.embeddings.box_layout_preprocessor:BoxLayoutPreprocessor" # Embeddings +"huggingface-embedding" = "edspdf.pipes.embeddings.huggingface_embedding:HuggingfaceEmbedding" "simple-text-embedding" = "edspdf.pipes.embeddings.simple_text_embedding:SimpleTextEmbedding" "sub-box-cnn-pooler" = "edspdf.pipes.embeddings.sub_box_cnn_pooler:SubBoxCNNPooler" "embedding-combiner" = "edspdf.pipes.embeddings.embedding_combiner:EmbeddingCombiner" diff --git a/tests/recipes/test_train.py b/tests/recipes/test_train.py index e3286b6e..e00f6b6b 100644 --- a/tests/recipes/test_train.py +++ b/tests/recipes/test_train.py @@ -278,3 +278,53 @@ def test_script(change_test_dir, dummy_dataset): ) assert result.exit_code == 0, result.stdout assert "Training model" in result.stdout + + +def test_function_huggingface(pdf, error_pdf, change_test_dir, dummy_dataset, tmp_path): + model = Pipeline() + model.add_pipe("pdfminer-extractor", name="extractor") + model.add_pipe( + "huggingface-embedding", + name="embedding", + config={ + "model": "microsoft/layoutlmv3-base", + "window": 128, + "stride": 64, + "use_image": False, + }, + ) + model.add_pipe( + "trainable-classifier", + name="classifier", + config={ + "embedding": model.get_pipe("embedding"), + "labels": [], + "activation": "relu", + }, + ) + trf = model.get_pipe("embedding") + trf.hf_model.encoder.layer = trf.hf_model.encoder.layer[:1] + + data_adapter = make_segmentation_adapter(dummy_dataset) + + train( + model=model, + train_data=data_adapter, + val_data=data_adapter, + max_steps=10, + batch_size=2, + validation_interval=4, + output_dir=tmp_path, + lr=0.001, + ) + + docs = list(data_adapter(model)) + + model = edspdf.load(tmp_path / "last-model") + + list(model.pipe([pdf] * 2 + [error_pdf] * 2)) + output = model(PDFDoc(content=pdf)) + + assert model.score(docs)["classifier"]["accuracy"] > 0.5 + + assert type(output) == PDFDoc From 02f3c057b6c660c8540dfc935c9ea8cb9cfdffaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Fri, 7 Jul 2023 15:52:54 +0200 Subject: [PATCH 2/7] fix: align image features with sliding text windows in hf transformers --- .../pipes/embeddings/huggingface_embedding.py | 40 +++++++++++-------- mkdocs.yml | 1 + pyproject.toml | 1 + 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py index 65a0fc67..39544c4e 100644 --- a/edspdf/pipes/embeddings/huggingface_embedding.py +++ b/edspdf/pipes/embeddings/huggingface_embedding.py @@ -166,11 +166,12 @@ def collate(self, batch, device): offset = 0 window_max_size = 0 window_count = 0 - windows_per_page = [] + windows = [] + windows_count_per_page = [] for sample_input_ids in batch["input_ids"]: for page_input_ids in sample_input_ids: # fmt: off - windows_per_page.append([ + windows.append([ [ offset + 0, *range(1 + offset + window_i * self.stride, @@ -179,15 +180,14 @@ def collate(self, batch, device): ] for window_i in range(0, 1 + max(0, math.ceil((len(page_input_ids) - 2 - self.window) / self.stride))) # noqa: E501 ]) + windows_count_per_page.append(len(windows[-1])) # fmt: on offset += len(page_input_ids) - window_max_size = max( - window_max_size, max(map(len, windows_per_page[-1])) - ) - window_count += len(windows_per_page[-1]) + window_max_size = max(window_max_size, max(map(len, windows[-1]))) + window_count += len(windows[-1]) windows = as_folded_tensor( - windows_per_page, + windows, full_names=("page", "window", "token"), data_dims=("window", "token"), dtype=torch.long, @@ -261,15 +261,19 @@ def collate(self, batch, device): "line_window_indices": indexer[line_window_indices].as_tensor(), "line_window_offsets_flat": line_window_offsets_flat, } + print(windows_count_per_page) if self.use_image: - collated["pixel_values"] = torch.stack( - [ - torch.as_tensor(x, device=device) - for x in as_folded_tensor( - batch["pixel_values"], **kw, dtype=torch.long - ) - ], - dim=0, + collated["pixel_values"] = ( + torch.stack( + [ + torch.from_numpy(page_pixels) + for sample_pages in batch["pixel_values"] + for page_pixels in sample_pages + ], + dim=0, + ) + .repeat_interleave(torch.as_tensor(windows_count_per_page), dim=0) + .to(device) ) return collated @@ -278,10 +282,12 @@ def forward(self, batch): input_ids=batch["input_ids"].as_tensor()[batch["windows"]], bbox=batch["bbox"].as_tensor()[batch["windows"]], attention_mask=batch["windows"].mask, - ).last_hidden_state + pixel_values=batch.get("pixel_values"), + ).last_hidden_state[:, : batch["windows"].shape[1]] + # TODO offset indices of line_window_indices instead of slicing token_embeddings line_embedding = torch.nn.functional.embedding_bag( input=batch["line_window_indices"], - weight=token_embeddings.view(-1, token_embeddings.size(-1)), + weight=token_embeddings.reshape(-1, token_embeddings.size(-1)), offsets=batch["line_window_offsets_flat"], mode=self.line_pooling, ) diff --git a/mkdocs.yml b/mkdocs.yml index 91db5409..d0d2b362 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -43,6 +43,7 @@ nav: - pipes/embeddings/sub-box-cnn-pooler.md - pipes/embeddings/box-layout-embedding.md - pipes/embeddings/box-transformer.md + - pipes/embeddings/huggingface-embedding.md - Extractors: - pipes/extractors/index.md - pipes/extractors/pdfminer.md diff --git a/pyproject.toml b/pyproject.toml index 63b906f2..4a1d79bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dev = [ "coverage>=6.5.0", "datasets~=2.10", "huggingface_hub>=0.8.1", + "transformers~=4.30", ] docs = [ "mike~=1.1.2", From db017099737539b654b8736241fb7c6d003d5abc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Fri, 7 Jul 2023 18:20:40 +0200 Subject: [PATCH 3/7] docs: update mkdocstrings dependencies --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4a1d79bf..03ca1b38 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,12 +49,12 @@ dev = [ docs = [ "mike~=1.1.2", "mkdocs@git+https://github.com/mkdocs/mkdocs.git@5af8bd30538ff8f0cfb698c8b90c3020da319f92", - "mkdocstrings==0.20.0", + "mkdocstrings~=0.20", + "mkdocstrings-python~=1.1", "mkdocs-autorefs@git+https://github.com/percevalw/mkdocs-autorefs.git@0.4.1.post0", "mkdocs-gen-files~=0.4.0", "mkdocs-literate-nav~=0.6.0", "mkdocs-material~=9.1.0", - "mkdocstrings-python~=0.8.3", "mkdocs-glightbox~=0.3.1", "pybtex~=0.24.0", ] From 3ed402a13f7309b2fc42cd59bdbc8f5d7bdf4698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Mon, 10 Jul 2023 11:28:10 +0200 Subject: [PATCH 4/7] draft --- edspdf/pipeline.py | 11 ++++++++--- edspdf/pipes/embeddings/huggingface_embedding.py | 9 ++++----- edspdf/structures.py | 2 +- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/edspdf/pipeline.py b/edspdf/pipeline.py index d1652740..8944ade1 100644 --- a/edspdf/pipeline.py +++ b/edspdf/pipeline.py @@ -588,16 +588,21 @@ def collate( return batch def parameters(self): + """Returns an iterator over the Pytorch parameters of the components in the + pipeline""" + return (p for n, p in self.named_parameters()) + + def named_parameters(self): """Returns an iterator over the Pytorch parameters of the components in the pipeline""" seen = set() for name, component in self.pipeline: - if hasattr(component, "parameters"): - for param in component.parameters(): + if hasattr(component, "named_parameters"): + for param_name, param in component.named_parameters(): if param in seen: continue seen.add(param) - yield param + yield f"{name}.{param_name}", param def to(self, device: Optional[torch.device] = None): """Moves the pipeline to a given device""" diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py index 39544c4e..b30c7dfe 100644 --- a/edspdf/pipes/embeddings/huggingface_embedding.py +++ b/edspdf/pipes/embeddings/huggingface_embedding.py @@ -256,12 +256,11 @@ def collate(self, batch, device): collated = { "input_ids": as_folded_tensor(batch["input_ids"], **kw, dtype=torch.long), "bbox": as_folded_tensor(batch["bbox"], **kw, dtype=torch.long), - "windows": windows, - "indexer": indexer[line_window_indices], - "line_window_indices": indexer[line_window_indices].as_tensor(), - "line_window_offsets_flat": line_window_offsets_flat, + "windows": windows.to(device), + "indexer": indexer[line_window_indices].to(device), + "line_window_indices": indexer[line_window_indices].as_tensor().to(device), + "line_window_offsets_flat": line_window_offsets_flat.to(device), } - print(windows_count_per_page) if self.use_image: collated["pixel_values"] = ( torch.stack( diff --git a/edspdf/structures.py b/edspdf/structures.py index 46629a64..ba632bee 100644 --- a/edspdf/structures.py +++ b/edspdf/structures.py @@ -201,7 +201,7 @@ class Box(BaseModel): @property def page(self): - return self.doc.pages[self.page_num] + return next(p for p in self.doc.pages if p.page_num == self.page_num) def __lt__(self, other): self_page_num = self.page_num or 0 From 6b4c6db81c5ee504ad2202be0a764a7154ea17ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Thu, 20 Jul 2023 13:26:37 +0200 Subject: [PATCH 5/7] fix: foldedtensor, box coords as floats, last_page feature, unidecode alternative --- .../embeddings/box_layout_preprocessor.py | 45 ++++++------------- .../pipes/embeddings/simple_text_embedding.py | 4 +- pyproject.toml | 3 +- 3 files changed, 17 insertions(+), 35 deletions(-) diff --git a/edspdf/pipes/embeddings/box_layout_preprocessor.py b/edspdf/pipes/embeddings/box_layout_preprocessor.py index fc7b93e1..42363ec4 100644 --- a/edspdf/pipes/embeddings/box_layout_preprocessor.py +++ b/edspdf/pipes/embeddings/box_layout_preprocessor.py @@ -1,11 +1,11 @@ -from typing import Any, Dict, Sequence +from typing import Any, Dict import torch from foldedtensor import FoldedTensor, as_folded_tensor from typing_extensions import TypedDict from edspdf import Pipeline, TrainablePipe, registry -from edspdf.structures import PDFDoc, TextBox +from edspdf.structures import PDFDoc BoxLayoutBatch = TypedDict( "BoxLayoutBatch", @@ -60,27 +60,10 @@ def __init__( ): super().__init__(pipeline, name) - def preprocess_boxes(self, boxes: Sequence[TextBox]): - box_pages = [box.page.page_num for box in boxes] - - last_page = max(box_pages, default=0) - - return { - "page": box_pages, - "xmin": [b.x0 for b in boxes], - "ymin": [b.y0 for b in boxes], - "xmax": [b.x1 for b in boxes], - "ymax": [b.y1 for b in boxes], - "width": [(b.x1 - b.x0) for b in boxes], - "height": [(b.y1 - b.y0) for b in boxes], - "first_page": [b.page_num == 0 for b in boxes], - "last_page": [b.page_num == last_page for b in boxes], - } - def preprocess(self, doc: PDFDoc, supervision: bool = False): pages = doc.pages - box_pages = [[b.page.page_num for b in page.text_boxes] for page in pages] - last_page = max(box_pages, default=0) + box_pages = [[b.page_num for b in page.text_boxes] for page in pages] + last_p = max((p for x in box_pages for p in x), default=0) return { "page": box_pages, "xmin": [[b.x0 for b in p.text_boxes] for p in pages], @@ -89,10 +72,8 @@ def preprocess(self, doc: PDFDoc, supervision: bool = False): "ymax": [[b.y1 for b in p.text_boxes] for p in pages], "width": [[(b.x1 - b.x0) for b in p.text_boxes] for p in pages], "height": [[(b.y1 - b.y0) for b in p.text_boxes] for p in pages], - "first_page": [[b.page.page_num == 0 for b in p.text_boxes] for p in pages], - "last_page": [ - [b.page.page_num == last_page for b in p.text_boxes] for p in pages - ], + "first_page": [[b.page_num == 0 for b in p.text_boxes] for p in pages], + "last_page": [[b.page_num == last_p for b in p.text_boxes] for p in pages], } def collate(self, batch, device: torch.device) -> BoxLayoutBatch: @@ -103,13 +84,13 @@ def collate(self, batch, device: torch.device) -> BoxLayoutBatch: } return { - "page": as_folded_tensor(batch["page"], dtype=torch.long, **kw), - "xmin": as_folded_tensor(batch["xmin"], dtype=torch.long, **kw), - "ymin": as_folded_tensor(batch["ymin"], dtype=torch.long, **kw), - "xmax": as_folded_tensor(batch["xmax"], dtype=torch.long, **kw), - "ymax": as_folded_tensor(batch["ymax"], dtype=torch.long, **kw), - "width": as_folded_tensor(batch["width"], dtype=torch.long, **kw), - "height": as_folded_tensor(batch["height"], dtype=torch.long, **kw), + "page": as_folded_tensor(batch["page"], dtype=torch.float, **kw), + "xmin": as_folded_tensor(batch["xmin"], dtype=torch.float, **kw), + "ymin": as_folded_tensor(batch["ymin"], dtype=torch.float, **kw), + "xmax": as_folded_tensor(batch["xmax"], dtype=torch.float, **kw), + "ymax": as_folded_tensor(batch["ymax"], dtype=torch.float, **kw), + "width": as_folded_tensor(batch["width"], dtype=torch.float, **kw), + "height": as_folded_tensor(batch["height"], dtype=torch.float, **kw), "first_page": as_folded_tensor(batch["first_page"], dtype=torch.bool, **kw), "last_page": as_folded_tensor(batch["last_page"], dtype=torch.bool, **kw), } diff --git a/edspdf/pipes/embeddings/simple_text_embedding.py b/edspdf/pipes/embeddings/simple_text_embedding.py index 2b7af282..d849947a 100644 --- a/edspdf/pipes/embeddings/simple_text_embedding.py +++ b/edspdf/pipes/embeddings/simple_text_embedding.py @@ -5,6 +5,7 @@ import regex import torch +from anyascii import anyascii from foldedtensor import FoldedTensor, as_folded_tensor from typing_extensions import TypedDict @@ -208,8 +209,7 @@ def preprocess(self, doc: PDFDoc): words = [m.group(0) for m in self.word_regex.finditer(b.text)] for word in words: - # ascii_str = unidecode(word) - ascii_str = word + ascii_str = anyascii(word) tokens_shape[-1][i].append( self.shape_voc.encode(word_shape(ascii_str)) ) diff --git a/pyproject.toml b/pyproject.toml index 03ca1b38..4830223b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,12 +15,13 @@ dynamic = ["version"] requires-python = ">3.7.6,<4.0,!=3.8.1" dependencies = [ + "anyascii>=0.3.2", "scikit-learn>=1.0.2,<2.0.0", "pydantic>=1.2,<2.0.0", "catalogue~=2.0", "networkx~=2.6", "confit>=0.2.1,<1.0.0", - "foldedtensor>=0.2.1,<1.0.0", + "foldedtensor>=0.3.0,<1.0.0", "torch>1.0.0", "accelerate>=0.12.0,<1.0.0", "tqdm~=4.64.1", From 78f86e779d799aad217970810eb57a8831516483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Thu, 20 Jul 2023 13:37:42 +0200 Subject: [PATCH 6/7] feat: enable sub-batching in huggingface transformers to ease memory usage --- .../pipes/embeddings/huggingface_embedding.py | 45 ++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py index b30c7dfe..88242dc9 100644 --- a/edspdf/pipes/embeddings/huggingface_embedding.py +++ b/edspdf/pipes/embeddings/huggingface_embedding.py @@ -88,6 +88,10 @@ class HuggingfaceEmbedding(TrainablePipe[EmbeddingOutput]): line_pooling: Literal["mean", "max", "sum"] The pooling strategy to use when combining the embeddings of the tokens in a line into a single line embedding + max_tokens_per_device: int + The maximum number of tokens that can be processed by the model on a single + device. This does not affect the results but can be used to reduce the memory + usage of the model, at the cost of a longer processing time. """ def __init__( @@ -99,6 +103,7 @@ def __init__( window: int = 510, stride: int = 255, line_pooling: Literal["mean", "max", "sum"] = "mean", + max_tokens_per_device: int = 128 * 128, ): super().__init__(pipeline, name) self.use_image = use_image @@ -113,6 +118,7 @@ def __init__( self.window = window self.stride = stride self.line_pooling = line_pooling + self.max_tokens_per_device = max_tokens_per_device def preprocess(self, doc: PDFDoc): res = { @@ -127,7 +133,7 @@ def preprocess(self, doc: PDFDoc): for page in doc.pages: # Preprocess it using LayoutLMv3 prep = self.tokenizer( - text=[line.text for line in doc.text_boxes], + text=[line.text for line in page.text_boxes], boxes=[ ( int(line.x0 * line.page.width), @@ -135,9 +141,9 @@ def preprocess(self, doc: PDFDoc): int(line.x1 * line.page.width), int(line.y1 * line.page.height), ) - for line in doc.text_boxes + for line in page.text_boxes ], - word_labels=range(len(doc.text_boxes)), + word_labels=range(len(page.text_boxes)), return_attention_mask=True, ) if self.use_image: @@ -240,9 +246,10 @@ def collate(self, batch, device): data_dims=("token",), dtype=torch.long, ) + last_after_one = max(1, len(line_window_offsets_flat) - 1) line_window_offsets_flat = as_folded_tensor( # discard the last offset, since we start from 0 and add each line length - data=torch.as_tensor(line_window_offsets_flat[:-1]), + data=torch.as_tensor(line_window_offsets_flat[:last_after_one]), data_dims=("line",), full_names=("sample", "page", "line"), lengths=line_window_indices.lengths[:-1], @@ -277,13 +284,31 @@ def collate(self, batch, device): return collated def forward(self, batch): - token_embeddings = self.hf_model.forward( - input_ids=batch["input_ids"].as_tensor()[batch["windows"]], - bbox=batch["bbox"].as_tensor()[batch["windows"]], - attention_mask=batch["windows"].mask, + windows = batch["windows"] + kwargs = dict( + input_ids=batch["input_ids"].as_tensor()[windows], + bbox=batch["bbox"].as_tensor()[windows], + attention_mask=windows.mask, pixel_values=batch.get("pixel_values"), - ).last_hidden_state[:, : batch["windows"].shape[1]] - # TODO offset indices of line_window_indices instead of slicing token_embeddings + ) + num_windows_per_batch = self.max_tokens_per_device // windows.shape[1] + + token_embeddings = [ + self.hf_model.forward( + **{ + k: None if v is None else v[offset : offset + num_windows_per_batch] + for k, v in kwargs.items() + } + ).last_hidden_state[:, : windows.shape[1]] + # TODO offset line_window_indices during collate + # instead of slicing token_embeddings + for offset in range(0, len(windows), num_windows_per_batch) + ] + token_embeddings = ( + torch.cat(token_embeddings, dim=0) + if len(token_embeddings) > 1 + else token_embeddings[0] + ) line_embedding = torch.nn.functional.embedding_bag( input=batch["line_window_indices"], weight=token_embeddings.reshape(-1, token_embeddings.size(-1)), From f8e9f6f441976ae7dbfc30ba56d38f19ea93e100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Wed, 26 Jul 2023 08:24:18 +0200 Subject: [PATCH 7/7] fix: edspdf 0.7.0 regression vs article --- docs/recipes/training.md | 3 --- edspdf/pipeline.py | 14 ++++++++++---- edspdf/pipes/classifiers/trainable.py | 10 ---------- edspdf/pipes/embeddings/box_layout_preprocessor.py | 7 ++----- edspdf/pipes/embeddings/huggingface_embedding.py | 1 - edspdf/pipes/embeddings/simple_text_embedding.py | 5 +++-- edspdf/pipes/embeddings/sub_box_cnn_pooler.py | 4 +++- edspdf/structures.py | 1 + pyproject.toml | 3 ++- tests/core/config.cfg | 1 - tests/recipes/config.cfg | 1 - tests/recipes/test_train.py | 2 -- 12 files changed, 21 insertions(+), 31 deletions(-) diff --git a/docs/recipes/training.md b/docs/recipes/training.md index 539ca543..5ee97433 100644 --- a/docs/recipes/training.md +++ b/docs/recipes/training.md @@ -63,7 +63,6 @@ model to decrease a given loss. The process of training a pipeline with EDS-PDF config={ "embedding": model.get_pipe("embedding"), "labels": [], - "activation": "relu", }, ) ``` @@ -309,7 +308,6 @@ Let's wrap the training code in a function, and make it callable from the comman config={ "embedding": model.get_pipe("embedding"), "labels": [], - "activation": "relu", }, ) @@ -483,7 +481,6 @@ def train_my_model( - config={ - "embedding": model.get_pipe("embedding"), - "labels": [], -- "activation": "relu", - }, - ) diff --git a/edspdf/pipeline.py b/edspdf/pipeline.py index 8944ade1..4f66824d 100644 --- a/edspdf/pipeline.py +++ b/edspdf/pipeline.py @@ -610,7 +610,6 @@ def to(self, device: Optional[torch.device] = None): component.to(device) return self - @contextmanager def train(self, mode=True): """ Enables training mode on pytorch modules @@ -621,12 +620,19 @@ def train(self, mode=True): Whether to enable training or not """ + class context: + def __enter__(self): + pass + + def __exit__(ctx_self, type, value, traceback): + for name, proc in self.trainable_pipes(): + proc.train(was_training[name]) + was_training = {name: proc.training for name, proc in self.trainable_pipes()} for name, proc in self.trainable_pipes(): proc.train(mode) - yield - for name, proc in self.trainable_pipes(): - proc.train(was_training[name]) + + return context() def score(self, docs: Sequence[PDFDoc], batch_size: int = None) -> Dict[str, Any]: """ diff --git a/edspdf/pipes/classifiers/trainable.py b/edspdf/pipes/classifiers/trainable.py index 239d4f26..db237663 100644 --- a/edspdf/pipes/classifiers/trainable.py +++ b/edspdf/pipes/classifiers/trainable.py @@ -15,7 +15,6 @@ from edspdf.registry import registry from edspdf.structures import PDFDoc from edspdf.trainable_pipe import Scorer, TrainablePipe -from edspdf.utils.torch import ActivationFunction, get_activation_function def classifier_scorer(pairs): @@ -70,7 +69,6 @@ class TrainableClassifier(TrainablePipe[Dict[str, Any]]): }, }, "labels": ["body", "pollution"], - "activation": "relu", }, ) ``` @@ -81,7 +79,6 @@ class TrainableClassifier(TrainablePipe[Dict[str, Any]]): [components.classifier] @factory = "trainable-classifier" labels = ["body", "pollution"] - activation = "relu" [components.classifier.embedding] @factory = "sub-box-cnn-pooler" @@ -99,8 +96,6 @@ class TrainableClassifier(TrainablePipe[Dict[str, Any]]): Initial labels of the classifier (will be completed during initialization) embedding: TrainablePipe[EmbeddingOutput] Embedding module to encode the PDF boxes - activation: ActivationFunction - Name of the activation function dropout_p: float Dropout probability used on the output of the box and textual encoders scorer: Scorer @@ -111,8 +106,6 @@ def __init__( self, embedding: TrainablePipe[EmbeddingOutput], labels: Sequence[str] = ("pollution",), - activation: ActivationFunction = "gelu", - dropout_p: float = 0.0, scorer: Scorer = classifier_scorer, pipeline: Pipeline = None, name: str = "trainable-classifier", @@ -128,9 +121,6 @@ def __init__( in_features=self.embedding.output_size, out_features=len(self.label_voc), ) - self.activation = get_activation_function(activation) - self.dropout = torch.nn.Dropout(dropout_p) - # Scoring function self.score = scorer diff --git a/edspdf/pipes/embeddings/box_layout_preprocessor.py b/edspdf/pipes/embeddings/box_layout_preprocessor.py index 42363ec4..4e1d98c3 100644 --- a/edspdf/pipes/embeddings/box_layout_preprocessor.py +++ b/edspdf/pipes/embeddings/box_layout_preprocessor.py @@ -10,7 +10,6 @@ BoxLayoutBatch = TypedDict( "BoxLayoutBatch", { - "page": FoldedTensor, "xmin": FoldedTensor, "ymin": FoldedTensor, "xmax": FoldedTensor, @@ -62,10 +61,9 @@ def __init__( def preprocess(self, doc: PDFDoc, supervision: bool = False): pages = doc.pages - box_pages = [[b.page_num for b in page.text_boxes] for page in pages] - last_p = max((p for x in box_pages for p in x), default=0) + [[b.page_num for b in page.text_boxes] for page in pages] + last_p = doc.num_pages - 1 return { - "page": box_pages, "xmin": [[b.x0 for b in p.text_boxes] for p in pages], "ymin": [[b.y0 for b in p.text_boxes] for p in pages], "xmax": [[b.x1 for b in p.text_boxes] for p in pages], @@ -84,7 +82,6 @@ def collate(self, batch, device: torch.device) -> BoxLayoutBatch: } return { - "page": as_folded_tensor(batch["page"], dtype=torch.float, **kw), "xmin": as_folded_tensor(batch["xmin"], dtype=torch.float, **kw), "ymin": as_folded_tensor(batch["ymin"], dtype=torch.float, **kw), "xmax": as_folded_tensor(batch["xmax"], dtype=torch.float, **kw), diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py index 88242dc9..c790d5c1 100644 --- a/edspdf/pipes/embeddings/huggingface_embedding.py +++ b/edspdf/pipes/embeddings/huggingface_embedding.py @@ -62,7 +62,6 @@ class HuggingfaceEmbedding(TrainablePipe[EmbeddingOutput]): config={ "embedding": model.get_pipe("embedding"), "labels": [], - "activation": "relu", }, ) ``` diff --git a/edspdf/pipes/embeddings/simple_text_embedding.py b/edspdf/pipes/embeddings/simple_text_embedding.py index d849947a..71a2c5e9 100644 --- a/edspdf/pipes/embeddings/simple_text_embedding.py +++ b/edspdf/pipes/embeddings/simple_text_embedding.py @@ -209,7 +209,8 @@ def preprocess(self, doc: PDFDoc): words = [m.group(0) for m in self.word_regex.finditer(b.text)] for word in words: - ascii_str = anyascii(word) + # ascii_str = unidecode.unidecode(word) + ascii_str = anyascii(word).strip() tokens_shape[-1][i].append( self.shape_voc.encode(word_shape(ascii_str)) ) @@ -253,7 +254,7 @@ def forward(self, batch: BoxTextEmbeddingInputBatch) -> EmbeddingOutput: self.shape_embedding(batch["tokens_shape"].as_tensor()) + self.prefix_embedding(batch["tokens_prefix"].as_tensor()) + self.suffix_embedding(batch["tokens_suffix"].as_tensor()) - + self.norm_embedding(batch["tokens_norm"].as_tensor()) + # + self.norm_embedding(batch["tokens_norm"].as_tensor()) ) return {"embeddings": batch["tokens_shape"].with_data(text_embeds)} diff --git a/edspdf/pipes/embeddings/sub_box_cnn_pooler.py b/edspdf/pipes/embeddings/sub_box_cnn_pooler.py index 849123f1..a917281e 100644 --- a/edspdf/pipes/embeddings/sub_box_cnn_pooler.py +++ b/edspdf/pipes/embeddings/sub_box_cnn_pooler.py @@ -98,10 +98,12 @@ def forward(self, batch: Any) -> EmbeddingOutput: dim=2, ) pooled = box_token_embeddings.max(1).values + pooled = self.linear(pooled) + # print("TEXT EMBEDS", pooled.shape, pooled.sum()) return { "embeddings": as_folded_tensor( - data=self.linear(pooled), + data=pooled, lengths=embeddings.lengths[:-1], # pooled on the last dim data_dims=["line"], # fully flattened full_names=["sample", "page", "line"], diff --git a/edspdf/structures.py b/edspdf/structures.py index ba632bee..a8ab615a 100644 --- a/edspdf/structures.py +++ b/edspdf/structures.py @@ -89,6 +89,7 @@ class PDFDoc(BaseModel): content: bytes = attrs.field(repr=lambda c: f"{len(c)} bytes") id: str = None + num_pages: int = 0 pages: List["Page"] = attrs.field(factory=list) error: bool = False content_boxes: List[Union["TextBox"]] = attrs.field(factory=list) diff --git a/pyproject.toml b/pyproject.toml index 4830223b..09e25a4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,8 @@ dependencies = [ "pdfminer.six>=20220319", "pypdfium2~=2.7", "rich-logger>=0.3.0,<1.0.0", - "safetensors~=0.3.1" + "safetensors~=0.3.1", + "anyascii>=0.3.2", ] [project.optional-dependencies] diff --git a/tests/core/config.cfg b/tests/core/config.cfg index 0ea43772..d2b240cb 100644 --- a/tests/core/config.cfg +++ b/tests/core/config.cfg @@ -11,7 +11,6 @@ components = ${components} [components.classifier] @factory = "trainable-classifier" labels = [] -activation = "relu" [components.classifier.embedding] @factory = "box-transformer" diff --git a/tests/recipes/config.cfg b/tests/recipes/config.cfg index c7a61197..10910896 100644 --- a/tests/recipes/config.cfg +++ b/tests/recipes/config.cfg @@ -34,7 +34,6 @@ n_layers = 1 [components.classifier] @factory = "trainable-classifier" labels = [] -activation = "relu" embedding = ${components.embedding} [components.embedding.embedding] diff --git a/tests/recipes/test_train.py b/tests/recipes/test_train.py index e00f6b6b..f27185c6 100644 --- a/tests/recipes/test_train.py +++ b/tests/recipes/test_train.py @@ -230,7 +230,6 @@ def test_function(pdf, error_pdf, change_test_dir, dummy_dataset, tmp_path): config={ "embedding": model.get_pipe("embedding"), "labels": [], - "activation": "relu", }, ) print(model.config.to_str()) @@ -299,7 +298,6 @@ def test_function_huggingface(pdf, error_pdf, change_test_dir, dummy_dataset, tm config={ "embedding": model.get_pipe("embedding"), "labels": [], - "activation": "relu", }, ) trf = model.get_pipe("embedding")