weareprestatech · krishnasism · Feb 20, 2024 · Feb 19, 2024 · Feb 19, 2024 · Feb 19, 2024
diff --git a/README.md b/README.md
@@ -84,6 +84,11 @@ hotpdf_document = HotPdf(pdf_file_path)
 with open(pdf_file_path, "rb") as f:
    hotpdf_document_2 = HotPdf(f)
 
+# Sometimes pdfminer.six will not replace (cid:x) values with their Unicode characters.
+# In that case, pass an EncodingTypes value as cid_overwrite_charset.
+from hotpdf.encodings.types import EncodingTypes
+hotpdf_cid_removal_object = HotPdf(pdf_file_path, cid_overwrite_charset=EncodingTypes.LATIN)
+
 # Get number of pages
 print(len(hotpdf_document.pages))
 

diff --git a/docs/source/guide.rst b/docs/source/guide.rst
@@ -42,6 +42,15 @@ Alternatively, to load a file, you can also defer loading from the constructor a
         hotpdf_document_2.load(f)
 
 
+Sometimes pdfminer.six will not replace (cid:x) values with their corresponding Unicode values.
+In that case, pass the charset to decode them with as the `cid_overwrite_charset` argument.
+
+.. code-block:: python
+
+   from hotpdf.encodings.types import EncodingTypes
+   hotpdf_cid_removal_object = HotPdf(f, cid_overwrite_charset=EncodingTypes.LATIN)
+
+
 The `HotPdf` object has many attributes that you can use to solve your problems. One of them is `pages`, representing each page of the PDF stored in data structures (trie & sparse matrix) to help with text operations.
 Locked PDFs can be loaded passing the password as the password argument:
 

diff --git a/docs/source/usage.rst b/docs/source/usage.rst
@@ -42,6 +42,15 @@ Alternatively you can defer loading, and use the `.load()` function instead. The
 
 .. autofunction:: hotpdf.HotPdf.load
 
+Sometimes pdfminer.six will not replace (cid:x) values with their corresponding Unicode values.
+In that case, pass the charset to decode them with as the `cid_overwrite_charset` argument.
+
+.. code-block:: python
+
+   from hotpdf.encodings.types import EncodingTypes
+   hotpdf_cid_removal_object = HotPdf(f, cid_overwrite_charset=EncodingTypes.LATIN)
+
+
 File Operations
 ------------------------------------------
 

diff --git a/hotpdf/encodings/__init__.py b/hotpdf/encodings/__init__.py
diff --git a/hotpdf/encodings/decoder.py b/hotpdf/encodings/decoder.py
@@ -0,0 +1,36 @@
+import re
+from typing import Optional
+
+from ..exceptions.custom_exceptions import DecoderNotInitalised
+from .types import EncodingTypes
+
+
+class Decoder:
+    """If there are no embedded fonts, pdfminer.six has issues with
+    mapping cid values to their specific unicode characters.
+    In that case, we manually override the (cid:int) values
+    """
+
+    __cid_mapping: dict[int, str] = {}
+    initialised: bool = False
+
+    def __init__(self, charset: Optional[EncodingTypes] = None) -> None:
+        if not charset:
+            return
+        if charset.value in EncodingTypes._value2member_map_:
+            self.initialised = True
+        if charset == EncodingTypes.LATIN:
+            from .mappings.latin import CID_TO_STR
+
+            self.__cid_mapping = CID_TO_STR
+        if not self.initialised:
+            raise DecoderNotInitalised("Decoder not initialised")
+
+    def cid_str_to_str(self, cid_str: str) -> str:
+        """Converts a (cid:int) notation to its corresponding charset character.
+        Returns a blank string if the cid has no mapping, or cid_str unchanged if it contains no digits.
+        """
+        cid_digit = re.search(r"\d+", cid_str)
+        if not cid_digit:
+            return cid_str
+        return self.__cid_mapping.get(int(cid_digit.group()), "")
diff --git a/hotpdf/encodings/mappings/__init__.py b/hotpdf/encodings/mappings/__init__.py
diff --git a/hotpdf/encodings/mappings/latin.py b/hotpdf/encodings/mappings/latin.py
@@ -0,0 +1,227 @@
+"""
+This map is extracted from the PDF Reference Manual 1.6, p. 925,
+  "D.1 Latin Character Set and Encodings"
+
+Format: cid: character (presumably the table's WIN column, i.e. WinAnsiEncoding)
+"""
+
+CID_TO_STR: dict[int, str] = {
+    65: "A",
+    198: "Æ",
+    193: "Á",
+    194: "Â",
+    196: "Ä",
+    192: "À",
+    197: "Å",
+    195: "Ã",
+    66: "B",
+    67: "C",
+    199: "Ç",
+    68: "D",
+    69: "E",
+    201: "É",
+    202: "Ê",
+    203: "Ë",
+    200: "È",
+    208: "Ð",
+    128: "€",
+    70: "F",
+    71: "G",
+    72: "H",
+    73: "I",
+    205: "Í",
+    206: "Î",
+    207: "Ï",
+    204: "Ì",
+    74: "J",
+    75: "K",
+    76: "L",
+    77: "M",
+    78: "N",
+    209: "Ñ",
+    79: "O",
+    140: "Œ",
+    211: "Ó",
+    212: "Ô",
+    214: "Ö",
+    210: "Ò",
+    216: "Ø",
+    213: "Õ",
+    80: "P",
+    81: "Q",
+    82: "R",
+    83: "S",
+    138: "Š",
+    84: "T",
+    222: "Þ",
+    85: "U",
+    218: "Ú",
+    219: "Û",
+    220: "Ü",
+    217: "Ù",
+    86: "V",
+    87: "W",
+    88: "X",
+    89: "Y",
+    221: "Ý",
+    159: "Ÿ",
+    90: "Z",
+    142: "Ž",
+    97: "a",
+    225: "á",
+    226: "â",
+    180: "´",
+    228: "ä",
+    230: "æ",
+    224: "à",
+    38: "&",
+    229: "å",
+    94: "^",
+    126: "~",
+    42: "*",
+    64: "@",
+    227: "ã",
+    98: "b",
+    92: "\\",
+    124: "|",
+    123: "{",
+    125: "}",
+    91: "[",
+    93: "]",
+    166: "¦",
+    149: "•",
+    99: "c",
+    231: "ç",
+    184: "¸",
+    162: "¢",
+    136: "ˆ",
+    58: ":",
+    44: ",",
+    169: "©",
+    164: "¤",
+    100: "d",
+    134: "†",
+    135: "‡",
+    176: "°",
+    168: "¨",
+    247: "÷",
+    36: "$",
+    101: "e",
+    233: "é",
+    234: "ê",
+    235: "ë",
+    232: "è",
+    56: "8",
+    133: "…",
+    151: "—",
+    150: "–",
+    61: "=",
+    240: "ð",
+    33: "!",
+    161: "¡",
+    102: "f",
+    53: "5",
+    131: "ƒ",
+    52: "4",
+    103: "g",
+    223: "ß",
+    96: "`",
+    62: ">",
+    171: "«",
+    187: "»",
+    139: "‹",
+    155: "›",
+    104: "h",
+    45: "-",
+    105: "i",
+    237: "í",
+    238: "î",
+    239: "ï",
+    236: "ì",
+    106: "j",
+    107: "k",
+    108: "l",
+    60: "<",
+    172: "¬",
+    109: "m",
+    175: "¯",
+    181: "µ",
+    215: "×",
+    110: "n",
+    160: " ",
+    57: "9",
+    241: "ñ",
+    35: "#",
+    111: "o",
+    243: "ó",
+    244: "ô",
+    246: "ö",
+    156: "œ",
+    242: "ò",
+    49: "1",
+    189: "½",
+    188: "¼",
+    185: "¹",
+    170: "ª",
+    186: "º",
+    248: "ø",
+    245: "õ",
+    112: "p",
+    182: "¶",
+    40: "(",
+    41: ")",
+    37: "%",
+    46: ".",
+    183: "·",
+    137: "‰",
+    43: "+",
+    177: "±",
+    113: "q",
+    63: "?",
+    191: "¿",
+    34: '"',
+    132: "„",
+    147: "“",
+    148: "”",
+    145: "‘",
+    146: "’",
+    130: "‚",
+    39: "'",
+    114: "r",
+    174: "®",
+    115: "s",
+    154: "š",
+    167: "§",
+    59: ";",
+    55: "7",
+    54: "6",
+    47: "/",
+    32: " ",
+    173: " ",
+    163: "£",
+    116: "t",
+    254: "þ",
+    51: "3",
+    190: "¾",
+    179: "³",
+    152: "˜",
+    153: "™",
+    50: "2",
+    178: "²",
+    117: "u",
+    250: "ú",
+    251: "û",
+    252: "ü",
+    249: "ù",
+    95: "_",
+    118: "v",
+    119: "w",
+    120: "x",
+    121: "y",
+    253: "ý",
+    255: "ÿ",
+    165: "¥",
+    122: "z",
+    158: "ž",
+    48: "0",
+}
diff --git a/hotpdf/encodings/types.py b/hotpdf/encodings/types.py
@@ -0,0 +1,5 @@
+from enum import Enum
+
+
+class EncodingTypes(Enum):
+    LATIN = "latin"
diff --git a/hotpdf/exceptions/__init__.py b/hotpdf/exceptions/__init__.py
diff --git a/hotpdf/exceptions/custom_exceptions.py b/hotpdf/exceptions/custom_exceptions.py
@@ -0,0 +1,2 @@
+class DecoderNotInitalised(Exception):
+    pass
diff --git a/hotpdf/hotpdf.py b/hotpdf/hotpdf.py
@@ -6,6 +6,7 @@
 from typing import Optional, Union
 
 from hotpdf import processor
+from hotpdf.encodings.types import EncodingTypes
 from hotpdf.memory_map import MemoryMap
 from hotpdf.utils import filter_adjacent_coords, intersect
 
@@ -21,6 +22,7 @@ def __init__(
         extraction_tolerance: int = 4,
         laparams: Optional[dict[str, Union[float, bool]]] = None,
         include_annotation_spaces: bool = False,
+        cid_overwrite_charset: Optional[EncodingTypes] = None,
     ) -> None:
         """Initialize the HotPdf class.
 
@@ -33,18 +35,25 @@ def __init__(
                 to adjust the bounding box for capturing text. Defaults to 4.
             laparams (dict[str, Union[float, bool]], optional): Layout parameters for pdfminer.
             include_annotation_spaces (bool, optional): Add annotation spaces to the memory map.
+            cid_overwrite_charset (EncodingTypes, optional): Charset used to replace (cid:x) values
+                that were left unconverted. Defaults to None, which returns the (cid:x) values as is.
 
         Raises:
             ValueError: If the page range is invalid.
             FileNotFoundError: If the file is not found.
             PermissionError: If the file is encrypted or the password is wrong.
-            RuntimeError: If an unkown error is generated by transfotmer.
+            RuntimeError: If an unknown error is generated by transformer.
         """
         self.pages: list[MemoryMap] = []
         self.extraction_tolerance: int = extraction_tolerance
         if pdf_file:
             self.load(
-                pdf_file, password, page_numbers, laparams=laparams, include_annotation_spaces=include_annotation_spaces
+                pdf_file,
+                password,
+                page_numbers,
+                laparams=laparams,
+                include_annotation_spaces=include_annotation_spaces,
+                cid_overwrite_charset=cid_overwrite_charset,
             )
 
     def __check_file_exists(self, pdf_file: str) -> None:
@@ -79,6 +88,7 @@ def load(
         page_numbers: Optional[list[int]] = None,
         laparams: Optional[dict[str, Union[float, bool]]] = None,
         include_annotation_spaces: bool = False,
+        cid_overwrite_charset: Optional[EncodingTypes] = None,
     ) -> None:
         """Load a PDF file into memory.
 
@@ -89,14 +99,21 @@ def load(
                 If not provided, will load all pages (default).
             laparams (dict[str, Union[float, bool]], optional): Layout parameters for pdfminer.
             include_annotation_spaces (bool, optional): Add annotation spaces to the memory map.
+            cid_overwrite_charset (EncodingTypes, optional): Charset used to replace (cid:x) values
+                that were left unconverted. Defaults to None, which returns the (cid:x) values as is.
         Raises:
             Exception: If an unknown error is generated by pdfminer.
         """
         page_numbers = page_numbers or []
         self.__prechecks(pdf_file, page_numbers)
         try:
             self.pages = processor.process(
-                pdf_file, password, page_numbers, laparams, include_annotation_spaces=include_annotation_spaces
+                source=pdf_file,
+                password=password,
+                page_numbers=page_numbers,
+                laparams=laparams,
+                include_annotation_spaces=include_annotation_spaces,
+                cid_overwrite_charset=cid_overwrite_charset,
             )
         except Exception as e:
             raise e