From edb811dca694bfcfed3dc547a3485e2791332236 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= <perceval.wajsburt-ext@aphp.fr>
Date: Fri, 7 Jul 2023 12:22:17 +0200
Subject: [PATCH 1/7] feat: new wrapper for huggingface multi-modal
 transformers + windowing option

---
 .../pipes/embeddings/huggingface-embedding.md |   7 +
 docs/pipes/embeddings/index.md                |  15 +-
 docs/recipes/training.md                      |   4 +-
 .../pipes/embeddings/huggingface_embedding.py | 288 ++++++++++++++++++
 pyproject.toml                                |   1 +
 tests/recipes/test_train.py                   |  50 +++
 6 files changed, 356 insertions(+), 9 deletions(-)
 create mode 100644 docs/pipes/embeddings/huggingface-embedding.md
 create mode 100644 edspdf/pipes/embeddings/huggingface_embedding.py
diff --git a/docs/pipes/embeddings/huggingface-embedding.md b/docs/pipes/embeddings/huggingface-embedding.md
new file mode 100644
index 00000000..2fc97695
--- /dev/null
+++ b/docs/pipes/embeddings/huggingface-embedding.md
@@ -0,0 +1,7 @@
+# HuggingfaceEmbedding {: #edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding }
+
+::: edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding
+    options:
+        heading_level: 2
+        show_bases: false
+        show_source: false
diff --git a/docs/pipes/embeddings/index.md b/docs/pipes/embeddings/index.md
index f17843da..1225c682 100644
--- a/docs/pipes/embeddings/index.md
+++ b/docs/pipes/embeddings/index.md
@@ -10,13 +10,14 @@ td:nth-child(1), td:nth-child(2) {
 }
 </style>
 
-| Factory name                                                                                 |  Description                                                       |
-|----------------------------------------------------------------------------------------------|--------------------------------------------------------------------|
-| [`simple-text-embedding`][edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding] |  A module that embeds the textual features of the blocks.          |
-| [`embedding-combiner`][edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner]         |  Encodes boxes using a combination of multiple encoders            |
-| [`sub-box-cnn-pooler`][edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler]           |  Pools the output of a CNN over the elements of a box (like words) |
-| [`box-layout-embedding`][edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding]    |  Encodes the layout of the boxes                                   |
-| [`box-transformer`][edspdf.pipes.embeddings.box_transformer.BoxTransformer]                  |  Contextualizes box representations using a transformer            |
+| Factory name                                                                                  | Description                                                       |
+|-----------------------------------------------------------------------------------------------|-------------------------------------------------------------------|
+| [`simple-text-embedding`][edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding]  | A module that embeds the textual features of the blocks.          |
+| [`embedding-combiner`][edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner]          | Encodes boxes using a combination of multiple encoders            |
+| [`sub-box-cnn-pooler`][edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler]            | Pools the output of a CNN over the elements of a box (like words) |
+| [`box-layout-embedding`][edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding]     | Encodes the layout of the boxes                                   |
+| [`box-transformer`][edspdf.pipes.embeddings.box_transformer.BoxTransformer]                   | Contextualizes box representations using a transformer            |
+| [`huggingface-embedding`][edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding] | Box representations using a Huggingface multi-modal model.        |
 
 <!-- --8<-- [end:components] -->
 
diff --git a/docs/recipes/training.md b/docs/recipes/training.md
index 8a447572..539ca543 100644
--- a/docs/recipes/training.md
+++ b/docs/recipes/training.md
@@ -190,10 +190,10 @@ def segmentation_adapter(
 
 ## Full example
 
-Let's wrap the training code in a function, and make it callable from the command line !
+Let's wrap the training code in a function, and make it callable from the command line using [confit](https://github.com/aphp/confit)!
 
 ???+ example "train.py"
-    ```python linenums="1" hl_lines="16-27"
+    ```python linenums="1"
     import itertools
     import json
     from pathlib import Path
diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py
new file mode 100644
index 00000000..65a0fc67
--- /dev/null
+++ b/edspdf/pipes/embeddings/huggingface_embedding.py
@@ -0,0 +1,288 @@
+import math
+
+import torch
+from foldedtensor import as_folded_tensor
+from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
+from typing_extensions import Literal
+
+from edspdf import TrainablePipe, registry
+from edspdf.pipeline import Pipeline
+from edspdf.pipes.embeddings import EmbeddingOutput
+from edspdf.structures import PDFDoc
+
+
+def compute_contextualization_scores(windows):
+    ramp = torch.arange(0, windows.shape[1], 1)
+    scores = (
+        torch.min(ramp, windows.mask.sum(1, keepdim=True) - 1 - ramp)
+        .clamp(min=0)
+        .view(-1)
+    )
+    return scores
+
+
+@registry.factory.register("huggingface-embedding")
+class HuggingfaceEmbedding(TrainablePipe[EmbeddingOutput]):
+    """
+    The HuggingfaceEmbedding component is a wrapper around the Huggingface multi-modal
+    models. Compared to using the raw Huggingface model, we offer a simple mechanism to
+    split long documents into strided windows before feeding them to the model.
+
+    Examples
+    --------
+
+    Here is an example of how to define a pipeline with the HuggingfaceEmbedding
+    component:
+
+    ```python
+    from edspdf import Pipeline
+
+    pipeline = Pipeline()
+    pipeline.add_pipe(
+        "mupdf-extractor",
+        name="extractor",
+        config={
+            "render_pages": True,
+        },
+    )
+    pipeline.add_pipe(
+        "huggingface-embedding",
+        name="embedding",
+        config={
+            "model": "microsoft/layoutlmv3-base",
+            "use_image": False,
+            "window": 128,
+            "stride": 64,
+            "line_pooling": "mean",
+        },
+    )
+    pipeline.add_pipe(
+        "trainable-classifier",
+        name="classifier",
+        config={
+            "embedding": pipeline.get_pipe("embedding"),
+            "labels": [],
+            "activation": "relu",
+        },
+    )
+    ```
+
+    This model can then be trained following the [training recipe](/recipes/training/).
+
+    Parameters
+    ----------
+    pipeline: Pipeline
+        The pipeline instance
+    name: str
+        The component name
+    model: str
+        The Huggingface model name or path
+    use_image: bool
+        Whether to use the image or not in the model
+    window: int
+        The window size to use when splitting long documents into smaller windows
+        before feeding them to the Transformer model (default: 510 = 512 - 2)
+    stride: int
+        The stride (distance between windows) to use when splitting long documents into
+        smaller windows (default: 510 / 2 = 255)
+    line_pooling: Literal["mean", "max", "sum"]
+        The pooling strategy to use when combining the embeddings of the tokens in a
+        line into a single line embedding
+    """
+
+    def __init__(
+        self,
+        pipeline: Pipeline = None,
+        name: str = "huggingface-embedding",
+        model: str = None,
+        use_image: bool = True,
+        window: int = 510,
+        stride: int = 255,
+        line_pooling: Literal["mean", "max", "sum"] = "mean",
+    ):
+        super().__init__(pipeline, name)
+        self.use_image = use_image
+        self.image_processor = (
+            AutoImageProcessor.from_pretrained(model, apply_ocr=False)
+            if use_image
+            else None
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(model)
+        self.hf_model = AutoModel.from_pretrained(model)
+        self.output_size = self.hf_model.config.hidden_size
+        self.window = window
+        self.stride = stride
+        self.line_pooling = line_pooling
+
+    def preprocess(self, doc: PDFDoc):
+        res = {
+            "input_ids": [],
+            "bbox": [],
+            "windows": [],
+            "line_starts": [],
+        }
+        if self.use_image:
+            res["pixel_values"] = []
+
+        for page in doc.pages:
+            # Preprocess it using LayoutLMv3
+            prep = self.tokenizer(
+                text=[line.text for line in doc.text_boxes],
+                boxes=[
+                    (
+                        int(line.x0 * line.page.width),
+                        int(line.y0 * line.page.height),
+                        int(line.x1 * line.page.width),
+                        int(line.y1 * line.page.height),
+                    )
+                    for line in doc.text_boxes
+                ],
+                word_labels=range(len(doc.text_boxes)),
+                return_attention_mask=True,
+            )
+            if self.use_image:
+                prep.update(self.image_processor(images=page.image))
+
+            # Compute line offsets into layoutlm generated tokens
+            line_indices = prep["labels"][:-1]
+            line_starts = [
+                i
+                for i, curr_index in enumerate(line_indices)
+                if curr_index != -100 and curr_index != line_indices[i - 1]
+            ]
+
+            res["input_ids"].append(prep["input_ids"])
+            res["bbox"].append(prep["bbox"])
+            res["line_starts"].append(line_starts)
+            if self.use_image:
+                res["pixel_values"].append(prep["pixel_values"][0])
+
+        return res
+
+    def collate(self, batch, device):
+        # Flatten most of these arrays to process batches page per page and
+        # not sample per sample
+
+        offset = 0
+        window_max_size = 0
+        window_count = 0
+        windows_per_page = []
+        for sample_input_ids in batch["input_ids"]:
+            for page_input_ids in sample_input_ids:
+                # fmt: off
+                windows_per_page.append([
+                    [
+                        offset + 0,
+                        *range(1 + offset + window_i * self.stride,
+                               1 + offset + min(window_i * self.stride + self.window, len(page_input_ids) - 2)),  # noqa: E501
+                        offset + len(page_input_ids) - 1,
+                    ]
+                    for window_i in range(0, 1 + max(0, math.ceil((len(page_input_ids) - 2 - self.window) / self.stride)))  # noqa: E501
+                ])
+                # fmt: on
+                offset += len(page_input_ids)
+                window_max_size = max(
+                    window_max_size, max(map(len, windows_per_page[-1]))
+                )
+                window_count += len(windows_per_page[-1])
+
+        windows = as_folded_tensor(
+            windows_per_page,
+            full_names=("page", "window", "token"),
+            data_dims=("window", "token"),
+            dtype=torch.long,
+        )
+        indexer = torch.zeros(windows.max() + 1, dtype=torch.long)
+
+        # Sort each occurrence of an initial token by its contextualization score:
+        # We can only use the amax reduction, so to retrieve the best occurrence, we
+        # insert the index of the token output by the transformer inside the score
+        # using a lexicographic approach
+        # (score + index / n_tokens) ~ (score * n_tokens + index), taking the max,
+        # and then retrieving the index of the token using the modulo operator.
+        scores = compute_contextualization_scores(windows)
+        scores = scores * len(scores) + torch.arange(len(scores))
+        indexer.index_reduce_(
+            dim=0,
+            source=scores,
+            index=windows.view(-1),
+            reduce="amax",
+        )
+        indexer %= len(scores)
+
+        # Get token indices for each line -> sample, page, line, token
+        line_window_indices = []
+        line_window_offsets_flat = [0]
+        offset = 0
+        for sample_input_ids, sample_line_starts in zip(
+            batch["input_ids"], batch["line_starts"]
+        ):
+            sample_line_window_indices = []
+            line_window_indices.append(sample_line_window_indices)
+            for page_line_starts, page_input_ids in zip(
+                sample_line_starts, sample_input_ids
+            ):
+                page_line_window_indices = []
+                sample_line_window_indices.append(page_line_window_indices)
+                for line_start, line_end in zip(
+                    page_line_starts, (*page_line_starts[1:], len(page_input_ids))
+                ):
+                    line_window_offsets_flat.append(
+                        line_window_offsets_flat[-1] + line_end - line_start
+                    )
+                    page_line_window_indices.append(
+                        list(range(offset + line_start, offset + line_end))
+                    )
+                offset += len(page_input_ids)
+        line_window_indices = as_folded_tensor(
+            line_window_indices,
+            full_names=("sample", "page", "line", "token"),
+            data_dims=("token",),
+            dtype=torch.long,
+        )
+        line_window_offsets_flat = as_folded_tensor(
+            # discard the last offset, since we start from 0 and add each line length
+            data=torch.as_tensor(line_window_offsets_flat[:-1]),
+            data_dims=("line",),
+            full_names=("sample", "page", "line"),
+            lengths=line_window_indices.lengths[:-1],
+        )
+
+        kw = dict(
+            full_names=("sample", "page", "subword"),
+            data_dims=("subword",),
+            device=device,
+        )
+        collated = {
+            "input_ids": as_folded_tensor(batch["input_ids"], **kw, dtype=torch.long),
+            "bbox": as_folded_tensor(batch["bbox"], **kw, dtype=torch.long),
+            "windows": windows,
+            "indexer": indexer[line_window_indices],
+            "line_window_indices": indexer[line_window_indices].as_tensor(),
+            "line_window_offsets_flat": line_window_offsets_flat,
+        }
+        if self.use_image:
+            collated["pixel_values"] = torch.stack(
+                [
+                    torch.as_tensor(x, device=device)
+                    for x in as_folded_tensor(
+                        batch["pixel_values"], **kw, dtype=torch.long
+                    )
+                ],
+                dim=0,
+            )
+        return collated
+
+    def forward(self, batch):
+        token_embeddings = self.hf_model.forward(
+            input_ids=batch["input_ids"].as_tensor()[batch["windows"]],
+            bbox=batch["bbox"].as_tensor()[batch["windows"]],
+            attention_mask=batch["windows"].mask,
+        ).last_hidden_state
+        line_embedding = torch.nn.functional.embedding_bag(
+            input=batch["line_window_indices"],
+            weight=token_embeddings.view(-1, token_embeddings.size(-1)),
+            offsets=batch["line_window_offsets_flat"],
+            mode=self.line_pooling,
+        )
+        return {"embeddings": line_embedding}
diff --git a/pyproject.toml b/pyproject.toml
index a03a7091..63b906f2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,6 +67,7 @@ docs = [
 "box-preprocessor" = "edspdf.pipes.embeddings.box_layout_preprocessor:BoxLayoutPreprocessor"
 
 # Embeddings
+"huggingface-embedding" = "edspdf.pipes.embeddings.huggingface_embedding:HuggingfaceEmbedding"
 "simple-text-embedding" = "edspdf.pipes.embeddings.simple_text_embedding:SimpleTextEmbedding"
 "sub-box-cnn-pooler" = "edspdf.pipes.embeddings.sub_box_cnn_pooler:SubBoxCNNPooler"
 "embedding-combiner" = "edspdf.pipes.embeddings.embedding_combiner:EmbeddingCombiner"
diff --git a/tests/recipes/test_train.py b/tests/recipes/test_train.py
index e3286b6e..e00f6b6b 100644
--- a/tests/recipes/test_train.py
+++ b/tests/recipes/test_train.py
@@ -278,3 +278,53 @@ def test_script(change_test_dir, dummy_dataset):
     )
     assert result.exit_code == 0, result.stdout
     assert "Training model" in result.stdout
+
+
+def test_function_huggingface(pdf, error_pdf, change_test_dir, dummy_dataset, tmp_path):
+    model = Pipeline()
+    model.add_pipe("pdfminer-extractor", name="extractor")
+    model.add_pipe(
+        "huggingface-embedding",
+        name="embedding",
+        config={
+            "model": "microsoft/layoutlmv3-base",
+            "window": 128,
+            "stride": 64,
+            "use_image": False,
+        },
+    )
+    model.add_pipe(
+        "trainable-classifier",
+        name="classifier",
+        config={
+            "embedding": model.get_pipe("embedding"),
+            "labels": [],
+            "activation": "relu",
+        },
+    )
+    trf = model.get_pipe("embedding")
+    trf.hf_model.encoder.layer = trf.hf_model.encoder.layer[:1]
+
+    data_adapter = make_segmentation_adapter(dummy_dataset)
+
+    train(
+        model=model,
+        train_data=data_adapter,
+        val_data=data_adapter,
+        max_steps=10,
+        batch_size=2,
+        validation_interval=4,
+        output_dir=tmp_path,
+        lr=0.001,
+    )
+
+    docs = list(data_adapter(model))
+
+    model = edspdf.load(tmp_path / "last-model")
+
+    list(model.pipe([pdf] * 2 + [error_pdf] * 2))
+    output = model(PDFDoc(content=pdf))
+
+    assert model.score(docs)["classifier"]["accuracy"] > 0.5
+
+    assert type(output) == PDFDoc

From 02f3c057b6c660c8540dfc935c9ea8cb9cfdffaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= <perceval.wajsburt-ext@aphp.fr>
Date: Fri, 7 Jul 2023 15:52:54 +0200
Subject: [PATCH 2/7] fix: align image features with sliding text windows in hf
 transformers

---
 .../pipes/embeddings/huggingface_embedding.py | 40 +++++++++++--------
 mkdocs.yml                                    |  1 +
 pyproject.toml                                |  1 +
 3 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py
index 65a0fc67..39544c4e 100644
--- a/edspdf/pipes/embeddings/huggingface_embedding.py
+++ b/edspdf/pipes/embeddings/huggingface_embedding.py
@@ -166,11 +166,12 @@ def collate(self, batch, device):
         offset = 0
         window_max_size = 0
         window_count = 0
-        windows_per_page = []
+        windows = []
+        windows_count_per_page = []
         for sample_input_ids in batch["input_ids"]:
             for page_input_ids in sample_input_ids:
                 # fmt: off
-                windows_per_page.append([
+                windows.append([
                     [
                         offset + 0,
                         *range(1 + offset + window_i * self.stride,
@@ -179,15 +180,14 @@ def collate(self, batch, device):
                     ]
                     for window_i in range(0, 1 + max(0, math.ceil((len(page_input_ids) - 2 - self.window) / self.stride)))  # noqa: E501
                 ])
+                windows_count_per_page.append(len(windows[-1]))
                 # fmt: on
                 offset += len(page_input_ids)
-                window_max_size = max(
-                    window_max_size, max(map(len, windows_per_page[-1]))
-                )
-                window_count += len(windows_per_page[-1])
+                window_max_size = max(window_max_size, max(map(len, windows[-1])))
+                window_count += len(windows[-1])
 
         windows = as_folded_tensor(
-            windows_per_page,
+            windows,
             full_names=("page", "window", "token"),
             data_dims=("window", "token"),
             dtype=torch.long,
@@ -261,15 +261,19 @@ def collate(self, batch, device):
             "line_window_indices": indexer[line_window_indices].as_tensor(),
             "line_window_offsets_flat": line_window_offsets_flat,
         }
+        print(windows_count_per_page)
         if self.use_image:
-            collated["pixel_values"] = torch.stack(
-                [
-                    torch.as_tensor(x, device=device)
-                    for x in as_folded_tensor(
-                        batch["pixel_values"], **kw, dtype=torch.long
-                    )
-                ],
-                dim=0,
+            collated["pixel_values"] = (
+                torch.stack(
+                    [
+                        torch.from_numpy(page_pixels)
+                        for sample_pages in batch["pixel_values"]
+                        for page_pixels in sample_pages
+                    ],
+                    dim=0,
+                )
+                .repeat_interleave(torch.as_tensor(windows_count_per_page), dim=0)
+                .to(device)
             )
         return collated
 
@@ -278,10 +282,12 @@ def forward(self, batch):
             input_ids=batch["input_ids"].as_tensor()[batch["windows"]],
             bbox=batch["bbox"].as_tensor()[batch["windows"]],
             attention_mask=batch["windows"].mask,
-        ).last_hidden_state
+            pixel_values=batch.get("pixel_values"),
+        ).last_hidden_state[:, : batch["windows"].shape[1]]
+        # TODO offset indices of line_window_indices instead of slicing token_embeddings
         line_embedding = torch.nn.functional.embedding_bag(
             input=batch["line_window_indices"],
-            weight=token_embeddings.view(-1, token_embeddings.size(-1)),
+            weight=token_embeddings.reshape(-1, token_embeddings.size(-1)),
             offsets=batch["line_window_offsets_flat"],
             mode=self.line_pooling,
         )
diff --git a/mkdocs.yml b/mkdocs.yml
index 91db5409..d0d2b362 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -43,6 +43,7 @@ nav:
         - pipes/embeddings/sub-box-cnn-pooler.md
         - pipes/embeddings/box-layout-embedding.md
         - pipes/embeddings/box-transformer.md
+        - pipes/embeddings/huggingface-embedding.md
       - Extractors:
         - pipes/extractors/index.md
         - pipes/extractors/pdfminer.md
diff --git a/pyproject.toml b/pyproject.toml
index 63b906f2..4a1d79bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,6 +44,7 @@ dev = [
     "coverage>=6.5.0",
     "datasets~=2.10",
     "huggingface_hub>=0.8.1",
+    "transformers~=4.30",
 ]
 docs = [
     "mike~=1.1.2",

From db017099737539b654b8736241fb7c6d003d5abc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= <perceval.wajsburt-ext@aphp.fr>
Date: Fri, 7 Jul 2023 18:20:40 +0200
Subject: [PATCH 3/7] docs: update mkdocstrings dependencies

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4a1d79bf..03ca1b38 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,12 +49,12 @@ dev = [
 docs = [
     "mike~=1.1.2",
     "mkdocs@git+https://github.com/mkdocs/mkdocs.git@5af8bd30538ff8f0cfb698c8b90c3020da319f92",
-    "mkdocstrings==0.20.0",
+    "mkdocstrings~=0.20",
+    "mkdocstrings-python~=1.1",
     "mkdocs-autorefs@git+https://github.com/percevalw/mkdocs-autorefs.git@0.4.1.post0",
     "mkdocs-gen-files~=0.4.0",
     "mkdocs-literate-nav~=0.6.0",
     "mkdocs-material~=9.1.0",
-    "mkdocstrings-python~=0.8.3",
     "mkdocs-glightbox~=0.3.1",
     "pybtex~=0.24.0",
 ]

From 3ed402a13f7309b2fc42cd59bdbc8f5d7bdf4698 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= <perceval.wajsburt-ext@aphp.fr>
Date: Mon, 10 Jul 2023 11:28:10 +0200
Subject: [PATCH 4/7] draft

---
 edspdf/pipeline.py                               | 11 ++++++++---
 edspdf/pipes/embeddings/huggingface_embedding.py |  9 ++++-----
 edspdf/structures.py                             |  2 +-
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/edspdf/pipeline.py b/edspdf/pipeline.py
index d1652740..8944ade1 100644
--- a/edspdf/pipeline.py
+++ b/edspdf/pipeline.py
@@ -588,16 +588,21 @@ def collate(
         return batch
 
     def parameters(self):
+        """Returns an iterator over the Pytorch parameters of the components in the
+        pipeline"""
+        return (p for n, p in self.named_parameters())
+
+    def named_parameters(self):
         """Returns an iterator over the Pytorch parameters of the components in the
         pipeline"""
         seen = set()
         for name, component in self.pipeline:
-            if hasattr(component, "parameters"):
-                for param in component.parameters():
+            if hasattr(component, "named_parameters"):
+                for param_name, param in component.named_parameters():
                     if param in seen:
                         continue
                     seen.add(param)
-                    yield param
+                    yield f"{name}.{param_name}", param
 
     def to(self, device: Optional[torch.device] = None):
         """Moves the pipeline to a given device"""
diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py
index 39544c4e..b30c7dfe 100644
--- a/edspdf/pipes/embeddings/huggingface_embedding.py
+++ b/edspdf/pipes/embeddings/huggingface_embedding.py
@@ -256,12 +256,11 @@ def collate(self, batch, device):
         collated = {
             "input_ids": as_folded_tensor(batch["input_ids"], **kw, dtype=torch.long),
             "bbox": as_folded_tensor(batch["bbox"], **kw, dtype=torch.long),
-            "windows": windows,
-            "indexer": indexer[line_window_indices],
-            "line_window_indices": indexer[line_window_indices].as_tensor(),
-            "line_window_offsets_flat": line_window_offsets_flat,
+            "windows": windows.to(device),
+            "indexer": indexer[line_window_indices].to(device),
+            "line_window_indices": indexer[line_window_indices].as_tensor().to(device),
+            "line_window_offsets_flat": line_window_offsets_flat.to(device),
         }
-        print(windows_count_per_page)
         if self.use_image:
             collated["pixel_values"] = (
                 torch.stack(
diff --git a/edspdf/structures.py b/edspdf/structures.py
index 46629a64..ba632bee 100644
--- a/edspdf/structures.py
+++ b/edspdf/structures.py
@@ -201,7 +201,7 @@ class Box(BaseModel):
 
     @property
     def page(self):
-        return self.doc.pages[self.page_num]
+        return next(p for p in self.doc.pages if p.page_num == self.page_num)
 
     def __lt__(self, other):
         self_page_num = self.page_num or 0

From 6b4c6db81c5ee504ad2202be0a764a7154ea17ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= <perceval.wajsburt-ext@aphp.fr>
Date: Thu, 20 Jul 2023 13:26:37 +0200
Subject: [PATCH 5/7] fix: foldedtensor, box coords as floats, last_page
 feature, unidecode alternative

---
 .../embeddings/box_layout_preprocessor.py     | 45 ++++++-------------
 .../pipes/embeddings/simple_text_embedding.py |  4 +-
 pyproject.toml                                |  3 +-
 3 files changed, 17 insertions(+), 35 deletions(-)

diff --git a/edspdf/pipes/embeddings/box_layout_preprocessor.py b/edspdf/pipes/embeddings/box_layout_preprocessor.py
index fc7b93e1..42363ec4 100644
--- a/edspdf/pipes/embeddings/box_layout_preprocessor.py
+++ b/edspdf/pipes/embeddings/box_layout_preprocessor.py
@@ -1,11 +1,11 @@
-from typing import Any, Dict, Sequence
+from typing import Any, Dict
 
 import torch
 from foldedtensor import FoldedTensor, as_folded_tensor
 from typing_extensions import TypedDict
 
 from edspdf import Pipeline, TrainablePipe, registry
-from edspdf.structures import PDFDoc, TextBox
+from edspdf.structures import PDFDoc
 
 BoxLayoutBatch = TypedDict(
     "BoxLayoutBatch",
@@ -60,27 +60,10 @@ def __init__(
     ):
         super().__init__(pipeline, name)
 
-    def preprocess_boxes(self, boxes: Sequence[TextBox]):
-        box_pages = [box.page.page_num for box in boxes]
-
-        last_page = max(box_pages, default=0)
-
-        return {
-            "page": box_pages,
-            "xmin": [b.x0 for b in boxes],
-            "ymin": [b.y0 for b in boxes],
-            "xmax": [b.x1 for b in boxes],
-            "ymax": [b.y1 for b in boxes],
-            "width": [(b.x1 - b.x0) for b in boxes],
-            "height": [(b.y1 - b.y0) for b in boxes],
-            "first_page": [b.page_num == 0 for b in boxes],
-            "last_page": [b.page_num == last_page for b in boxes],
-        }
-
     def preprocess(self, doc: PDFDoc, supervision: bool = False):
         pages = doc.pages
-        box_pages = [[b.page.page_num for b in page.text_boxes] for page in pages]
-        last_page = max(box_pages, default=0)
+        box_pages = [[b.page_num for b in page.text_boxes] for page in pages]
+        last_p = max((p for x in box_pages for p in x), default=0)
         return {
             "page": box_pages,
             "xmin": [[b.x0 for b in p.text_boxes] for p in pages],
@@ -89,10 +72,8 @@ def preprocess(self, doc: PDFDoc, supervision: bool = False):
             "ymax": [[b.y1 for b in p.text_boxes] for p in pages],
             "width": [[(b.x1 - b.x0) for b in p.text_boxes] for p in pages],
             "height": [[(b.y1 - b.y0) for b in p.text_boxes] for p in pages],
-            "first_page": [[b.page.page_num == 0 for b in p.text_boxes] for p in pages],
-            "last_page": [
-                [b.page.page_num == last_page for b in p.text_boxes] for p in pages
-            ],
+            "first_page": [[b.page_num == 0 for b in p.text_boxes] for p in pages],
+            "last_page": [[b.page_num == last_p for b in p.text_boxes] for p in pages],
         }
 
     def collate(self, batch, device: torch.device) -> BoxLayoutBatch:
@@ -103,13 +84,13 @@ def collate(self, batch, device: torch.device) -> BoxLayoutBatch:
         }
 
         return {
-            "page": as_folded_tensor(batch["page"], dtype=torch.long, **kw),
-            "xmin": as_folded_tensor(batch["xmin"], dtype=torch.long, **kw),
-            "ymin": as_folded_tensor(batch["ymin"], dtype=torch.long, **kw),
-            "xmax": as_folded_tensor(batch["xmax"], dtype=torch.long, **kw),
-            "ymax": as_folded_tensor(batch["ymax"], dtype=torch.long, **kw),
-            "width": as_folded_tensor(batch["width"], dtype=torch.long, **kw),
-            "height": as_folded_tensor(batch["height"], dtype=torch.long, **kw),
+            "page": as_folded_tensor(batch["page"], dtype=torch.float, **kw),
+            "xmin": as_folded_tensor(batch["xmin"], dtype=torch.float, **kw),
+            "ymin": as_folded_tensor(batch["ymin"], dtype=torch.float, **kw),
+            "xmax": as_folded_tensor(batch["xmax"], dtype=torch.float, **kw),
+            "ymax": as_folded_tensor(batch["ymax"], dtype=torch.float, **kw),
+            "width": as_folded_tensor(batch["width"], dtype=torch.float, **kw),
+            "height": as_folded_tensor(batch["height"], dtype=torch.float, **kw),
             "first_page": as_folded_tensor(batch["first_page"], dtype=torch.bool, **kw),
             "last_page": as_folded_tensor(batch["last_page"], dtype=torch.bool, **kw),
         }
diff --git a/edspdf/pipes/embeddings/simple_text_embedding.py b/edspdf/pipes/embeddings/simple_text_embedding.py
index 2b7af282..d849947a 100644
--- a/edspdf/pipes/embeddings/simple_text_embedding.py
+++ b/edspdf/pipes/embeddings/simple_text_embedding.py
@@ -5,6 +5,7 @@
 
 import regex
 import torch
+from anyascii import anyascii
 from foldedtensor import FoldedTensor, as_folded_tensor
 from typing_extensions import TypedDict
 
@@ -208,8 +209,7 @@ def preprocess(self, doc: PDFDoc):
                 words = [m.group(0) for m in self.word_regex.finditer(b.text)]
 
                 for word in words:
-                    # ascii_str = unidecode(word)
-                    ascii_str = word
+                    ascii_str = anyascii(word)
                     tokens_shape[-1][i].append(
                         self.shape_voc.encode(word_shape(ascii_str))
                     )
diff --git a/pyproject.toml b/pyproject.toml
index 03ca1b38..4830223b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,12 +15,13 @@ dynamic = ["version"]
 requires-python = ">3.7.6,<4.0,!=3.8.1"
 
 dependencies = [
+    "anyascii>=0.3.2",
     "scikit-learn>=1.0.2,<2.0.0",
     "pydantic>=1.2,<2.0.0",
     "catalogue~=2.0",
     "networkx~=2.6",
     "confit>=0.2.1,<1.0.0",
-    "foldedtensor>=0.2.1,<1.0.0",
+    "foldedtensor>=0.3.0,<1.0.0",
     "torch>1.0.0",
     "accelerate>=0.12.0,<1.0.0",
     "tqdm~=4.64.1",

From 78f86e779d799aad217970810eb57a8831516483 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= <perceval.wajsburt-ext@aphp.fr>
Date: Thu, 20 Jul 2023 13:37:42 +0200
Subject: [PATCH 6/7] feat: enable sub-batching in huggingface transformers to
 ease memory usage

---
 .../pipes/embeddings/huggingface_embedding.py | 45 ++++++++++++++-----
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py
index b30c7dfe..88242dc9 100644
--- a/edspdf/pipes/embeddings/huggingface_embedding.py
+++ b/edspdf/pipes/embeddings/huggingface_embedding.py
@@ -88,6 +88,10 @@ class HuggingfaceEmbedding(TrainablePipe[EmbeddingOutput]):
     line_pooling: Literal["mean", "max", "sum"]
         The pooling strategy to use when combining the embeddings of the tokens in a
         line into a single line embedding
+    max_tokens_per_device: int
+        The maximum number of tokens that can be processed by the model on a single
+        device. This does not affect the results but can be used to reduce the memory
+        usage of the model, at the cost of a longer processing time.
     """
 
     def __init__(
@@ -99,6 +103,7 @@ def __init__(
         window: int = 510,
         stride: int = 255,
         line_pooling: Literal["mean", "max", "sum"] = "mean",
+        max_tokens_per_device: int = 128 * 128,
     ):
         super().__init__(pipeline, name)
         self.use_image = use_image
@@ -113,6 +118,7 @@ def __init__(
         self.window = window
         self.stride = stride
         self.line_pooling = line_pooling
+        self.max_tokens_per_device = max_tokens_per_device
 
     def preprocess(self, doc: PDFDoc):
         res = {
@@ -127,7 +133,7 @@ def preprocess(self, doc: PDFDoc):
         for page in doc.pages:
             # Preprocess it using LayoutLMv3
             prep = self.tokenizer(
-                text=[line.text for line in doc.text_boxes],
+                text=[line.text for line in page.text_boxes],
                 boxes=[
                     (
                         int(line.x0 * line.page.width),
@@ -135,9 +141,9 @@ def preprocess(self, doc: PDFDoc):
                         int(line.x1 * line.page.width),
                         int(line.y1 * line.page.height),
                     )
-                    for line in doc.text_boxes
+                    for line in page.text_boxes
                 ],
-                word_labels=range(len(doc.text_boxes)),
+                word_labels=range(len(page.text_boxes)),
                 return_attention_mask=True,
             )
             if self.use_image:
@@ -240,9 +246,10 @@ def collate(self, batch, device):
             data_dims=("token",),
             dtype=torch.long,
         )
+        last_after_one = max(1, len(line_window_offsets_flat) - 1)
         line_window_offsets_flat = as_folded_tensor(
             # discard the last offset, since we start from 0 and add each line length
-            data=torch.as_tensor(line_window_offsets_flat[:-1]),
+            data=torch.as_tensor(line_window_offsets_flat[:last_after_one]),
             data_dims=("line",),
             full_names=("sample", "page", "line"),
             lengths=line_window_indices.lengths[:-1],
@@ -277,13 +284,31 @@ def collate(self, batch, device):
         return collated
 
     def forward(self, batch):
-        token_embeddings = self.hf_model.forward(
-            input_ids=batch["input_ids"].as_tensor()[batch["windows"]],
-            bbox=batch["bbox"].as_tensor()[batch["windows"]],
-            attention_mask=batch["windows"].mask,
+        windows = batch["windows"]
+        kwargs = dict(
+            input_ids=batch["input_ids"].as_tensor()[windows],
+            bbox=batch["bbox"].as_tensor()[windows],
+            attention_mask=windows.mask,
             pixel_values=batch.get("pixel_values"),
-        ).last_hidden_state[:, : batch["windows"].shape[1]]
-        # TODO offset indices of line_window_indices instead of slicing token_embeddings
+        )
+        # Clamp to 1: a budget below one window would give range() a zero step.
+        num_windows_per_batch = max(1, self.max_tokens_per_device // windows.shape[1])
+        token_embeddings = [
+            self.hf_model.forward(
+                **{
+                    k: None if v is None else v[offset : offset + num_windows_per_batch]
+                    for k, v in kwargs.items()
+                }
+            ).last_hidden_state[:, : windows.shape[1]]
+            # TODO offset line_window_indices during collate
+            #      instead of slicing token_embeddings
+            for offset in range(0, len(windows), num_windows_per_batch)
+        ]
+        token_embeddings = (
+            torch.cat(token_embeddings, dim=0)
+            if len(token_embeddings) > 1
+            else token_embeddings[0]
+        )
         line_embedding = torch.nn.functional.embedding_bag(
             input=batch["line_window_indices"],
             weight=token_embeddings.reshape(-1, token_embeddings.size(-1)),

From f8e9f6f441976ae7dbfc30ba56d38f19ea93e100 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= <perceval.wajsburt-ext@aphp.fr>
Date: Wed, 26 Jul 2023 08:24:18 +0200
Subject: [PATCH 7/7] fix: edspdf 0.7.0 regression vs article

---
 docs/recipes/training.md                           |  3 ---
 edspdf/pipeline.py                                 | 14 ++++++++++----
 edspdf/pipes/classifiers/trainable.py              | 10 ----------
 edspdf/pipes/embeddings/box_layout_preprocessor.py |  7 ++-----
 edspdf/pipes/embeddings/huggingface_embedding.py   |  1 -
 edspdf/pipes/embeddings/simple_text_embedding.py   |  5 +++--
 edspdf/pipes/embeddings/sub_box_cnn_pooler.py      |  4 +++-
 edspdf/structures.py                               |  1 +
 pyproject.toml                                     |  3 ++-
 tests/core/config.cfg                              |  1 -
 tests/recipes/config.cfg                           |  1 -
 tests/recipes/test_train.py                        |  2 --
 12 files changed, 21 insertions(+), 31 deletions(-)

diff --git a/docs/recipes/training.md b/docs/recipes/training.md
index 539ca543..5ee97433 100644
--- a/docs/recipes/training.md
+++ b/docs/recipes/training.md
@@ -63,7 +63,6 @@ model to decrease a given loss. The process of training a pipeline with EDS-PDF
         config={
             "embedding": model.get_pipe("embedding"),
             "labels": [],
-            "activation": "relu",
         },
     )
     ```
@@ -309,7 +308,6 @@ Let's wrap the training code in a function, and make it callable from the comman
             config={
                 "embedding": model.get_pipe("embedding"),
                 "labels": [],
-                "activation": "relu",
             },
         )
 
@@ -483,7 +481,6 @@ def train_my_model(
 -       config={
 -           "embedding": model.get_pipe("embedding"),
 -           "labels": [],
--           "activation": "relu",
 -       },
 -   )
 
diff --git a/edspdf/pipeline.py b/edspdf/pipeline.py
index 8944ade1..4f66824d 100644
--- a/edspdf/pipeline.py
+++ b/edspdf/pipeline.py
@@ -610,7 +610,6 @@ def to(self, device: Optional[torch.device] = None):
             component.to(device)
         return self
 
-    @contextmanager
     def train(self, mode=True):
         """
         Enables training mode on pytorch modules
@@ -621,12 +620,19 @@ def train(self, mode=True):
             Whether to enable training or not
         """
 
+        class context:
+            def __enter__(self):
+                pass
+
+            def __exit__(ctx_self, type, value, traceback):
+                for name, proc in self.trainable_pipes():
+                    proc.train(was_training[name])
+
         was_training = {name: proc.training for name, proc in self.trainable_pipes()}
         for name, proc in self.trainable_pipes():
             proc.train(mode)
-        yield
-        for name, proc in self.trainable_pipes():
-            proc.train(was_training[name])
+
+        return context()
 
     def score(self, docs: Sequence[PDFDoc], batch_size: int = None) -> Dict[str, Any]:
         """
diff --git a/edspdf/pipes/classifiers/trainable.py b/edspdf/pipes/classifiers/trainable.py
index 239d4f26..db237663 100644
--- a/edspdf/pipes/classifiers/trainable.py
+++ b/edspdf/pipes/classifiers/trainable.py
@@ -15,7 +15,6 @@
 from edspdf.registry import registry
 from edspdf.structures import PDFDoc
 from edspdf.trainable_pipe import Scorer, TrainablePipe
-from edspdf.utils.torch import ActivationFunction, get_activation_function
 
 
 def classifier_scorer(pairs):
@@ -70,7 +69,6 @@ class TrainableClassifier(TrainablePipe[Dict[str, Any]]):
                     },
                 },
                 "labels": ["body", "pollution"],
-                "activation": "relu",
             },
         )
         ```
@@ -81,7 +79,6 @@ class TrainableClassifier(TrainablePipe[Dict[str, Any]]):
         [components.classifier]
         @factory = "trainable-classifier"
         labels = ["body", "pollution"]
-        activation = "relu"
 
         [components.classifier.embedding]
         @factory = "sub-box-cnn-pooler"
@@ -99,8 +96,6 @@ class TrainableClassifier(TrainablePipe[Dict[str, Any]]):
         Initial labels of the classifier (will be completed during initialization)
     embedding: TrainablePipe[EmbeddingOutput]
         Embedding module to encode the PDF boxes
-    activation: ActivationFunction
-        Name of the activation function
     dropout_p: float
         Dropout probability used on the output of the box and textual encoders
     scorer: Scorer
@@ -111,8 +106,6 @@ def __init__(
         self,
         embedding: TrainablePipe[EmbeddingOutput],
         labels: Sequence[str] = ("pollution",),
-        activation: ActivationFunction = "gelu",
-        dropout_p: float = 0.0,
         scorer: Scorer = classifier_scorer,
         pipeline: Pipeline = None,
         name: str = "trainable-classifier",
@@ -128,9 +121,6 @@ def __init__(
             in_features=self.embedding.output_size,
             out_features=len(self.label_voc),
         )
-        self.activation = get_activation_function(activation)
-        self.dropout = torch.nn.Dropout(dropout_p)
-
         # Scoring function
         self.score = scorer
 
diff --git a/edspdf/pipes/embeddings/box_layout_preprocessor.py b/edspdf/pipes/embeddings/box_layout_preprocessor.py
index 42363ec4..4e1d98c3 100644
--- a/edspdf/pipes/embeddings/box_layout_preprocessor.py
+++ b/edspdf/pipes/embeddings/box_layout_preprocessor.py
@@ -10,7 +10,6 @@
 BoxLayoutBatch = TypedDict(
     "BoxLayoutBatch",
     {
-        "page": FoldedTensor,
         "xmin": FoldedTensor,
         "ymin": FoldedTensor,
         "xmax": FoldedTensor,
@@ -62,10 +61,9 @@ def __init__(
 
     def preprocess(self, doc: PDFDoc, supervision: bool = False):
         pages = doc.pages
-        box_pages = [[b.page_num for b in page.text_boxes] for page in pages]
-        last_p = max((p for x in box_pages for p in x), default=0)
+        # last page index comes from doc.num_pages, not from the boxes' page_num
+        last_p = doc.num_pages - 1
         return {
-            "page": box_pages,
             "xmin": [[b.x0 for b in p.text_boxes] for p in pages],
             "ymin": [[b.y0 for b in p.text_boxes] for p in pages],
             "xmax": [[b.x1 for b in p.text_boxes] for p in pages],
@@ -84,7 +82,6 @@ def collate(self, batch, device: torch.device) -> BoxLayoutBatch:
         }
 
         return {
-            "page": as_folded_tensor(batch["page"], dtype=torch.float, **kw),
             "xmin": as_folded_tensor(batch["xmin"], dtype=torch.float, **kw),
             "ymin": as_folded_tensor(batch["ymin"], dtype=torch.float, **kw),
             "xmax": as_folded_tensor(batch["xmax"], dtype=torch.float, **kw),
diff --git a/edspdf/pipes/embeddings/huggingface_embedding.py b/edspdf/pipes/embeddings/huggingface_embedding.py
index 88242dc9..c790d5c1 100644
--- a/edspdf/pipes/embeddings/huggingface_embedding.py
+++ b/edspdf/pipes/embeddings/huggingface_embedding.py
@@ -62,7 +62,6 @@ class HuggingfaceEmbedding(TrainablePipe[EmbeddingOutput]):
         config={
             "embedding": model.get_pipe("embedding"),
             "labels": [],
-            "activation": "relu",
         },
     )
     ```
diff --git a/edspdf/pipes/embeddings/simple_text_embedding.py b/edspdf/pipes/embeddings/simple_text_embedding.py
index d849947a..71a2c5e9 100644
--- a/edspdf/pipes/embeddings/simple_text_embedding.py
+++ b/edspdf/pipes/embeddings/simple_text_embedding.py
@@ -209,7 +209,8 @@ def preprocess(self, doc: PDFDoc):
                 words = [m.group(0) for m in self.word_regex.finditer(b.text)]
 
                 for word in words:
-                    ascii_str = anyascii(word)
+                    # transliterate to ASCII, dropping surrounding whitespace
+                    ascii_str = anyascii(word).strip()
                     tokens_shape[-1][i].append(
                         self.shape_voc.encode(word_shape(ascii_str))
                     )
@@ -253,7 +254,7 @@ def forward(self, batch: BoxTextEmbeddingInputBatch) -> EmbeddingOutput:
             self.shape_embedding(batch["tokens_shape"].as_tensor())
             + self.prefix_embedding(batch["tokens_prefix"].as_tensor())
             + self.suffix_embedding(batch["tokens_suffix"].as_tensor())
-            + self.norm_embedding(batch["tokens_norm"].as_tensor())
+            # + self.norm_embedding(batch["tokens_norm"].as_tensor())
         )
 
         return {"embeddings": batch["tokens_shape"].with_data(text_embeds)}
diff --git a/edspdf/pipes/embeddings/sub_box_cnn_pooler.py b/edspdf/pipes/embeddings/sub_box_cnn_pooler.py
index 849123f1..a917281e 100644
--- a/edspdf/pipes/embeddings/sub_box_cnn_pooler.py
+++ b/edspdf/pipes/embeddings/sub_box_cnn_pooler.py
@@ -98,10 +98,12 @@ def forward(self, batch: Any) -> EmbeddingOutput:
             dim=2,
         )
         pooled = box_token_embeddings.max(1).values
+        pooled = self.linear(pooled)
+        # pooled now holds the linearly projected, max-pooled token features
 
         return {
             "embeddings": as_folded_tensor(
-                data=self.linear(pooled),
+                data=pooled,
                 lengths=embeddings.lengths[:-1],  # pooled on the last dim
                 data_dims=["line"],  # fully flattened
                 full_names=["sample", "page", "line"],
diff --git a/edspdf/structures.py b/edspdf/structures.py
index ba632bee..a8ab615a 100644
--- a/edspdf/structures.py
+++ b/edspdf/structures.py
@@ -89,6 +89,7 @@ class PDFDoc(BaseModel):
 
     content: bytes = attrs.field(repr=lambda c: f"{len(c)} bytes")
     id: str = None
+    num_pages: int = 0
     pages: List["Page"] = attrs.field(factory=list)
     error: bool = False
     content_boxes: List[Union["TextBox"]] = attrs.field(factory=list)
diff --git a/pyproject.toml b/pyproject.toml
index 4830223b..09e25a4e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,8 @@ dependencies = [
     "pdfminer.six>=20220319",
     "pypdfium2~=2.7",
     "rich-logger>=0.3.0,<1.0.0",
-    "safetensors~=0.3.1"
+    "safetensors~=0.3.1",
+    "anyascii>=0.3.2",
 ]
 
 [project.optional-dependencies]
diff --git a/tests/core/config.cfg b/tests/core/config.cfg
index 0ea43772..d2b240cb 100644
--- a/tests/core/config.cfg
+++ b/tests/core/config.cfg
@@ -11,7 +11,6 @@ components = ${components}
 [components.classifier]
 @factory = "trainable-classifier"
 labels = []
-activation = "relu"
 
 [components.classifier.embedding]
 @factory = "box-transformer"
diff --git a/tests/recipes/config.cfg b/tests/recipes/config.cfg
index c7a61197..10910896 100644
--- a/tests/recipes/config.cfg
+++ b/tests/recipes/config.cfg
@@ -34,7 +34,6 @@ n_layers = 1
 [components.classifier]
 @factory = "trainable-classifier"
 labels = []
-activation = "relu"
 embedding = ${components.embedding}
 
 [components.embedding.embedding]
diff --git a/tests/recipes/test_train.py b/tests/recipes/test_train.py
index e00f6b6b..f27185c6 100644
--- a/tests/recipes/test_train.py
+++ b/tests/recipes/test_train.py
@@ -230,7 +230,6 @@ def test_function(pdf, error_pdf, change_test_dir, dummy_dataset, tmp_path):
         config={
             "embedding": model.get_pipe("embedding"),
             "labels": [],
-            "activation": "relu",
         },
     )
     print(model.config.to_str())
@@ -299,7 +298,6 @@ def test_function_huggingface(pdf, error_pdf, change_test_dir, dummy_dataset, tm
         config={
             "embedding": model.get_pipe("embedding"),
             "labels": [],
-            "activation": "relu",
         },
     )
     trf = model.get_pipe("embedding")