Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: Replace (cid:x) values for Latin charset #111

Merged
merged 11 commits into from
Feb 20, 2024
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ hotpdf_document = HotPdf(pdf_file_path)
with open(pdf_file_path, "rb") as f:
hotpdf_document_2 = HotPdf(f)

# Sometimes pdfminer will not replace (cid:x) values properly.
# In that case, pass an EncodingTypes member as cid_overwrite_charset.
from hotpdf.encodings.types import EncodingTypes
hotpdf_cid_removal_object = HotPdf(f, cid_overwrite_charset=EncodingTypes.LATIN)

# Get number of pages
print(len(hotpdf_document.pages))

Expand Down
9 changes: 9 additions & 0 deletions docs/source/guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ Alternatively, to load a file, you can also defer loading from the constructor a
hotpdf_document_2.load(f)


Sometimes pdfminer.six will not replace (cid:x) values with their corresponding Unicode values.
In that case, pass the charset to use as the `cid_overwrite_charset` argument, using a member of `EncodingTypes`.

.. code-block:: python

from hotpdf.encodings.types import EncodingTypes
hotpdf_cid_removal_object = HotPdf(f, cid_overwrite_charset=EncodingTypes.LATIN)


The `HotPdf` object has many attributes that you can use to solve your problems. One of them is `pages`, representing each page of the PDF stored in data structures (trie & sparse matrix) to help with text operations.
Locked PDFs can be loaded by passing the password as the `password` argument:

Expand Down
9 changes: 9 additions & 0 deletions docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,15 @@ Alternatively you can defer loading, and use the `.load()` function instead. The

.. autofunction:: hotpdf.HotPdf.load

Sometimes pdfminer.six will not replace (cid:x) values with their corresponding Unicode values.
In that case, pass the charset to use as the `cid_overwrite_charset` argument, using a member of `EncodingTypes`.

.. code-block:: python

from hotpdf.encodings.types import EncodingTypes
hotpdf_cid_removal_object = HotPdf(f, cid_overwrite_charset=EncodingTypes.LATIN)


File Operations
------------------------------------------

Expand Down
Empty file added hotpdf/encodings/__init__.py
Empty file.
36 changes: 36 additions & 0 deletions hotpdf/encodings/decoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import re
from typing import Optional

from ..exceptions.custom_exceptions import DecoderNotInitalised
from .types import EncodingTypes


class Decoder:
    """Translate pdfminer.six ``(cid:int)`` placeholders into characters.

    If there are no embedded fonts, pdfminer.six has issues with mapping
    cid values to their specific unicode characters. In that case, the
    ``(cid:int)`` values are overridden manually from a charset mapping.

    Attributes:
        initialised: True once a charset mapping has been loaded. A decoder
            created without a charset stays uninitialised and keeps an
            empty mapping.
    """

    # Class-level defaults; __init__ rebinds them on the instance when a
    # charset is loaded. The shared empty dict is only ever read, never mutated.
    __cid_mapping: dict[int, str] = {}
    initialised: bool = False

    def __init__(self, charset: Optional[EncodingTypes] = None) -> None:
        """Load the cid-to-character mapping for ``charset``.

        Args:
            charset: Charset whose mapping should be used. When falsy
                (the default ``None``), nothing is loaded and the decoder
                stays uninitialised.

        Raises:
            DecoderNotInitalised: If ``charset`` is given but no mapping
                could be loaded for it.
        """
        if not charset:
            return
        # Compare against the enum member itself instead of peeking at the
        # private ``_value2member_map_``: anything that is not a supported
        # member (a plain string, a member of another enum, ...) falls
        # through to the error below rather than raising AttributeError or
        # yielding an "initialised" decoder with an empty mapping.
        if charset is EncodingTypes.LATIN:
            # Imported lazily so the table is only loaded when requested.
            from .mappings.latin import CID_TO_STR

            self.__cid_mapping = CID_TO_STR
            self.initialised = True
        if not self.initialised:
            raise DecoderNotInitalised("Decoder not initialised")

    def cid_str_to_str(self, cid_str: str) -> str:
        """Convert a ``(cid:int)`` notation to its mapped character.

        Args:
            cid_str: Text expected to contain a cid number, e.g. ``(cid:65)``.

        Returns:
            The character mapped to the first run of digits found in
            ``cid_str``; a blank string when that number has no mapping;
            ``cid_str`` unchanged when it contains no digits at all.
        """
        cid_digit = re.search(r"\d+", cid_str)
        if not cid_digit:
            return cid_str
        return self.__cid_mapping.get(int(cid_digit.group()), "")
krishnasism marked this conversation as resolved.
Show resolved Hide resolved
Empty file.
227 changes: 227 additions & 0 deletions hotpdf/encodings/mappings/latin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
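"""
This map is extracted from the PDF Reference Manual 1.6, p. 925,
"D.1 Latin Character Set and Encodings".

Format: `cid: character` - each key is an integer cid (the code from the
table's WIN column, written in decimal) and each value is the character
that replaces the corresponding (cid:x) placeholder.
"""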

CID_TO_STR: dict[int, str] = {
65: "A",
198: "Æ",
193: "Á",
194: "Â",
196: "Ä",
192: "À",
197: "Å",
195: "Ã",
66: "B",
67: "C",
199: "Ç",
68: "D",
69: "E",
201: "É",
202: "Ê",
203: "Ë",
200: "È",
208: "Ð",
128: "€",
70: "F",
71: "G",
72: "H",
73: "I",
205: "Í",
206: "Î",
207: "Ï",
204: "Ì",
74: "J",
75: "K",
76: "L",
77: "M",
78: "N",
209: "Ñ",
79: "O",
140: "Œ",
211: "Ó",
212: "Ô",
214: "Ö",
210: "Ò",
216: "Ø",
213: "Õ",
80: "P",
81: "Q",
82: "R",
83: "S",
138: "Š",
84: "T",
222: "Þ",
85: "U",
218: "Ú",
219: "Û",
220: "Ü",
217: "Ù",
86: "V",
87: "W",
88: "X",
89: "Y",
221: "Ý",
159: "Ÿ",
90: "Z",
142: "Ž",
97: "a",
225: "á",
226: "â",
180: "´",
228: "ä",
230: "æ",
224: "à",
38: "&",
229: "å",
94: "^",
126: "~",
42: "*",
64: "@",
227: "ã",
98: "b",
92: "\\",
124: "|",
123: "{",
125: "}",
91: "[",
93: "]",
166: "¦",
149: "•",
99: "c",
231: "ç",
184: "¸",
162: "¢",
136: "ˆ",
58: ":",
44: ",",
169: "©",
164: "¤",
100: "d",
134: "†",
135: "‡",
176: "°",
168: "¨",
247: "÷",
36: "$",
101: "e",
233: "é",
234: "ê",
235: "ë",
232: "è",
56: "8",
133: "…",
151: "—",
150: "–",
61: "=",
240: "ð",
33: "!",
161: "¡",
102: "f",
53: "5",
131: "ƒ",
52: "4",
103: "g",
223: "ß",
96: "`",
62: ">",
171: "«",
187: "»",
139: "‹",
155: "›",
104: "h",
45: "-",
105: "i",
237: "í",
238: "î",
239: "ï",
236: "ì",
106: "j",
107: "k",
108: "l",
60: "<",
172: "¬",
109: "m",
175: "¯",
181: "µ",
215: "×",
110: "n",
160: " ",
57: "9",
241: "ñ",
35: "#",
111: "o",
243: "ó",
244: "ô",
246: "ö",
156: "œ",
242: "ò",
49: "1",
189: "½",
188: "¼",
185: "¹",
170: "ª",
186: "º",
248: "ø",
245: "õ",
112: "p",
182: "¶",
40: "(",
41: ")",
37: "%",
46: ".",
183: "·",
137: "‰",
43: "+",
177: "±",
113: "q",
63: "?",
191: "¿",
34: '"',
132: "„",
147: "“",
148: "”",
145: "‘",
146: "’",
130: "‚",
39: "'",
114: "r",
174: "®",
115: "s",
154: "š",
167: "§",
59: ";",
55: "7",
54: "6",
47: "/",
32: " ",
173: " ",
163: "£",
116: "t",
254: "þ",
51: "3",
190: "¾",
179: "³",
152: "˜",
153: "™",
50: "2",
178: "²",
117: "u",
250: "ú",
251: "û",
252: "ü",
249: "ù",
95: "_",
118: "v",
119: "w",
120: "x",
121: "y",
253: "ý",
255: "ÿ",
165: "¥",
122: "z",
158: "ž",
48: "0",
}
5 changes: 5 additions & 0 deletions hotpdf/encodings/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from enum import Enum


class EncodingTypes(Enum):
    """Charsets that can be selected to overwrite unconverted (cid:x) values."""

    # Latin character set; the only charset currently defined.
    LATIN = "latin"
Empty file added hotpdf/exceptions/__init__.py
Empty file.
2 changes: 2 additions & 0 deletions hotpdf/exceptions/custom_exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class DecoderNotInitalised(Exception):
    """Raised when a Decoder is given a charset but ends up not initialised."""
23 changes: 20 additions & 3 deletions hotpdf/hotpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from typing import Optional, Union

from hotpdf import processor
from hotpdf.encodings.types import EncodingTypes
from hotpdf.memory_map import MemoryMap
from hotpdf.utils import filter_adjacent_coords, intersect

Expand All @@ -21,6 +22,7 @@ def __init__(
extraction_tolerance: int = 4,
laparams: Optional[dict[str, Union[float, bool]]] = None,
include_annotation_spaces: bool = False,
cid_overwrite_charset: Optional[EncodingTypes] = None,
) -> None:
"""Initialize the HotPdf class.

Expand All @@ -33,18 +35,25 @@ def __init__(
to adjust the bounding box for capturing text. Defaults to 4.
laparams (dict[str, Union[float, bool]], optional): Layout parameters for pdfminer.
include_annotation_spaces (bool, optional): Add annotation spaces to the memory map.
cid_overwrite_charset (EncodingTypes, optional): Overwrite encode charset for (cid:x) values
that haven't been converted. Default None, will return cid values as is without conversion

Raises:
ValueError: If the page range is invalid.
FileNotFoundError: If the file is not found.
PermissionError: If the file is encrypted or the password is wrong.
RuntimeError: If an unkown error is generated by transfotmer.
RuntimeError: If an unknown error is generated by transformer.
"""
self.pages: list[MemoryMap] = []
self.extraction_tolerance: int = extraction_tolerance
if pdf_file:
self.load(
pdf_file, password, page_numbers, laparams=laparams, include_annotation_spaces=include_annotation_spaces
pdf_file,
password,
page_numbers,
laparams=laparams,
include_annotation_spaces=include_annotation_spaces,
cid_overwrite_charset=cid_overwrite_charset,
)

def __check_file_exists(self, pdf_file: str) -> None:
Expand Down Expand Up @@ -79,6 +88,7 @@ def load(
page_numbers: Optional[list[int]] = None,
laparams: Optional[dict[str, Union[float, bool]]] = None,
include_annotation_spaces: bool = False,
cid_overwrite_charset: Optional[EncodingTypes] = None,
) -> None:
"""Load a PDF file into memory.

Expand All @@ -89,14 +99,21 @@ def load(
If not provided, will load all pages (default).
laparams (dict[str, Union[float, bool]], optional): Layout parameters for pdfminer.
include_annotation_spaces (bool, optional): Add annotation spaces to the memory map.
cid_overwrite_charset (EncodingTypes, optional): Overwrite encode charset for (cid:x) values
that haven't been converted. Default None, will return cid values as is without conversion
Raises:
Exception: If an unknown error is generated by pdfminer.
"""
page_numbers = page_numbers or []
self.__prechecks(pdf_file, page_numbers)
try:
self.pages = processor.process(
pdf_file, password, page_numbers, laparams, include_annotation_spaces=include_annotation_spaces
source=pdf_file,
password=password,
page_numbers=page_numbers,
laparams=laparams,
include_annotation_spaces=include_annotation_spaces,
cid_overwrite_charset=cid_overwrite_charset,
)
except Exception as e:
raise e
Expand Down
Loading