Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Huggingface multi-modal transformers #15

Merged
merged 7 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/pipes/embeddings/huggingface-embedding.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# HuggingfaceEmbedding {: #edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding }

::: edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding
options:
heading_level: 2
show_bases: false
show_source: false
15 changes: 8 additions & 7 deletions docs/pipes/embeddings/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@ td:nth-child(1), td:nth-child(2) {
}
</style>

| Factory name | Description |
|----------------------------------------------------------------------------------------------|--------------------------------------------------------------------|
| [`simple-text-embedding`][edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding] | A module that embeds the textual features of the blocks. |
| [`embedding-combiner`][edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner] | Encodes boxes using a combination of multiple encoders |
| [`sub-box-cnn-pooler`][edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler] | Pools the output of a CNN over the elements of a box (like words) |
| [`box-layout-embedding`][edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding] | Encodes the layout of the boxes |
| [`box-transformer`][edspdf.pipes.embeddings.box_transformer.BoxTransformer] | Contextualizes box representations using a transformer |
| Factory name | Description |
|-----------------------------------------------------------------------------------------------|-------------------------------------------------------------------|
| [`simple-text-embedding`][edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding] | A module that embeds the textual features of the blocks. |
| [`embedding-combiner`][edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner] | Encodes boxes using a combination of multiple encoders |
| [`sub-box-cnn-pooler`][edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler] | Pools the output of a CNN over the elements of a box (like words) |
| [`box-layout-embedding`][edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding] | Encodes the layout of the boxes |
| [`box-transformer`][edspdf.pipes.embeddings.box_transformer.BoxTransformer] | Contextualizes box representations using a transformer |
| [`huggingface-embedding`][edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding] | Computes box representations using a Hugging Face multi-modal model. |

<!-- --8<-- [end:components] -->

Expand Down
7 changes: 2 additions & 5 deletions docs/recipes/training.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ model to decrease a given loss. The process of training a pipeline with EDS-PDF
config={
"embedding": model.get_pipe("embedding"),
"labels": [],
"activation": "relu",
},
)
```
Expand Down Expand Up @@ -190,10 +189,10 @@ def segmentation_adapter(

## Full example

Let's wrap the training code in a function, and make it callable from the command line !
Let's wrap the training code in a function, and make it callable from the command line using [confit](https://github.com/aphp/confit)!

???+ example "train.py"
```python linenums="1" hl_lines="16-27"
```python linenums="1"
import itertools
import json
from pathlib import Path
Expand Down Expand Up @@ -309,7 +308,6 @@ Let's wrap the training code in a function, and make it callable from the comman
config={
"embedding": model.get_pipe("embedding"),
"labels": [],
"activation": "relu",
},
)

Expand Down Expand Up @@ -483,7 +481,6 @@ def train_my_model(
- config={
- "embedding": model.get_pipe("embedding"),
- "labels": [],
- "activation": "relu",
- },
- )

Expand Down
25 changes: 18 additions & 7 deletions edspdf/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,24 +588,28 @@ def collate(
return batch

def parameters(self):
"""Returns an iterator over the Pytorch parameters of the components in the
pipeline"""
return (p for n, p in self.named_parameters())

def named_parameters(self):
"""Returns an iterator over the Pytorch parameters of the components in the
pipeline"""
seen = set()
for name, component in self.pipeline:
if hasattr(component, "parameters"):
for param in component.parameters():
if hasattr(component, "named_parameters"):
for param_name, param in component.named_parameters():
if param in seen:
continue
seen.add(param)
yield param
yield f"{name}.{param_name}", param

def to(self, device: Optional[torch.device] = None):
"""Moves the pipeline to a given device"""
for name, component in self.trainable_pipes():
component.to(device)
return self

@contextmanager
def train(self, mode=True):
"""
Enables training mode on pytorch modules
Expand All @@ -616,12 +620,19 @@ def train(self, mode=True):
Whether to enable training or not
"""

class context:
def __enter__(self):
pass

def __exit__(ctx_self, type, value, traceback):
for name, proc in self.trainable_pipes():
proc.train(was_training[name])

was_training = {name: proc.training for name, proc in self.trainable_pipes()}
for name, proc in self.trainable_pipes():
proc.train(mode)
yield
for name, proc in self.trainable_pipes():
proc.train(was_training[name])

return context()

def score(self, docs: Sequence[PDFDoc], batch_size: int = None) -> Dict[str, Any]:
"""
Expand Down
10 changes: 0 additions & 10 deletions edspdf/pipes/classifiers/trainable.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from edspdf.registry import registry
from edspdf.structures import PDFDoc
from edspdf.trainable_pipe import Scorer, TrainablePipe
from edspdf.utils.torch import ActivationFunction, get_activation_function


def classifier_scorer(pairs):
Expand Down Expand Up @@ -70,7 +69,6 @@ class TrainableClassifier(TrainablePipe[Dict[str, Any]]):
},
},
"labels": ["body", "pollution"],
"activation": "relu",
},
)
```
Expand All @@ -81,7 +79,6 @@ class TrainableClassifier(TrainablePipe[Dict[str, Any]]):
[components.classifier]
@factory = "trainable-classifier"
labels = ["body", "pollution"]
activation = "relu"

[components.classifier.embedding]
@factory = "sub-box-cnn-pooler"
Expand All @@ -99,8 +96,6 @@ class TrainableClassifier(TrainablePipe[Dict[str, Any]]):
Initial labels of the classifier (will be completed during initialization)
embedding: TrainablePipe[EmbeddingOutput]
Embedding module to encode the PDF boxes
activation: ActivationFunction
Name of the activation function
dropout_p: float
Dropout probability used on the output of the box and textual encoders
scorer: Scorer
Expand All @@ -111,8 +106,6 @@ def __init__(
self,
embedding: TrainablePipe[EmbeddingOutput],
labels: Sequence[str] = ("pollution",),
activation: ActivationFunction = "gelu",
dropout_p: float = 0.0,
scorer: Scorer = classifier_scorer,
pipeline: Pipeline = None,
name: str = "trainable-classifier",
Expand All @@ -128,9 +121,6 @@ def __init__(
in_features=self.embedding.output_size,
out_features=len(self.label_voc),
)
self.activation = get_activation_function(activation)
self.dropout = torch.nn.Dropout(dropout_p)

# Scoring function
self.score = scorer

Expand Down
46 changes: 12 additions & 34 deletions edspdf/pipes/embeddings/box_layout_preprocessor.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
from typing import Any, Dict, Sequence
from typing import Any, Dict

import torch
from foldedtensor import FoldedTensor, as_folded_tensor
from typing_extensions import TypedDict

from edspdf import Pipeline, TrainablePipe, registry
from edspdf.structures import PDFDoc, TextBox
from edspdf.structures import PDFDoc

BoxLayoutBatch = TypedDict(
"BoxLayoutBatch",
{
"page": FoldedTensor,
"xmin": FoldedTensor,
"ymin": FoldedTensor,
"xmax": FoldedTensor,
Expand Down Expand Up @@ -60,39 +59,19 @@ def __init__(
):
super().__init__(pipeline, name)

def preprocess_boxes(self, boxes: Sequence[TextBox]):
box_pages = [box.page.page_num for box in boxes]

last_page = max(box_pages, default=0)

return {
"page": box_pages,
"xmin": [b.x0 for b in boxes],
"ymin": [b.y0 for b in boxes],
"xmax": [b.x1 for b in boxes],
"ymax": [b.y1 for b in boxes],
"width": [(b.x1 - b.x0) for b in boxes],
"height": [(b.y1 - b.y0) for b in boxes],
"first_page": [b.page_num == 0 for b in boxes],
"last_page": [b.page_num == last_page for b in boxes],
}

def preprocess(self, doc: PDFDoc, supervision: bool = False):
pages = doc.pages
box_pages = [[b.page.page_num for b in page.text_boxes] for page in pages]
last_page = max(box_pages, default=0)
[[b.page_num for b in page.text_boxes] for page in pages]
last_p = doc.num_pages - 1
return {
"page": box_pages,
"xmin": [[b.x0 for b in p.text_boxes] for p in pages],
"ymin": [[b.y0 for b in p.text_boxes] for p in pages],
"xmax": [[b.x1 for b in p.text_boxes] for p in pages],
"ymax": [[b.y1 for b in p.text_boxes] for p in pages],
"width": [[(b.x1 - b.x0) for b in p.text_boxes] for p in pages],
"height": [[(b.y1 - b.y0) for b in p.text_boxes] for p in pages],
"first_page": [[b.page.page_num == 0 for b in p.text_boxes] for p in pages],
"last_page": [
[b.page.page_num == last_page for b in p.text_boxes] for p in pages
],
"first_page": [[b.page_num == 0 for b in p.text_boxes] for p in pages],
"last_page": [[b.page_num == last_p for b in p.text_boxes] for p in pages],
}

def collate(self, batch, device: torch.device) -> BoxLayoutBatch:
Expand All @@ -103,13 +82,12 @@ def collate(self, batch, device: torch.device) -> BoxLayoutBatch:
}

return {
"page": as_folded_tensor(batch["page"], dtype=torch.long, **kw),
"xmin": as_folded_tensor(batch["xmin"], dtype=torch.long, **kw),
"ymin": as_folded_tensor(batch["ymin"], dtype=torch.long, **kw),
"xmax": as_folded_tensor(batch["xmax"], dtype=torch.long, **kw),
"ymax": as_folded_tensor(batch["ymax"], dtype=torch.long, **kw),
"width": as_folded_tensor(batch["width"], dtype=torch.long, **kw),
"height": as_folded_tensor(batch["height"], dtype=torch.long, **kw),
"xmin": as_folded_tensor(batch["xmin"], dtype=torch.float, **kw),
"ymin": as_folded_tensor(batch["ymin"], dtype=torch.float, **kw),
"xmax": as_folded_tensor(batch["xmax"], dtype=torch.float, **kw),
"ymax": as_folded_tensor(batch["ymax"], dtype=torch.float, **kw),
"width": as_folded_tensor(batch["width"], dtype=torch.float, **kw),
"height": as_folded_tensor(batch["height"], dtype=torch.float, **kw),
"first_page": as_folded_tensor(batch["first_page"], dtype=torch.bool, **kw),
"last_page": as_folded_tensor(batch["last_page"], dtype=torch.bool, **kw),
}
Expand Down
Loading