Modern typing annotation: Set, List, Dict -> set, list, dict, etc.
sveinbjornt committed Aug 22, 2024
1 parent cdba944 commit b26b89e
Showing 4 changed files with 48 additions and 58 deletions.
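The commit applies the PEP 585 spelling available since Python 3.9: builtin container types can be subscripted directly in annotations, so the typing.Dict, typing.List, typing.Set and typing.FrozenSet aliases are dropped. Below is a minimal sketch of the before and after style (illustrative only, not code from this commit; the get_corrections helper is invented, although the variable names mirror abbrev.py).

# Illustrative sketch of the annotation change; not code from the commit.
from typing import Optional

# Old style, requiring: from typing import Dict, List, Set
#   MEANINGS: Set[str] = set()
#   WRONGDOTS: Dict[str, List[str]] = {}

# New style (PEP 585, Python 3.9+): subscript the builtins directly
MEANINGS: set[str] = set()
WRONGDOTS: dict[str, list[str]] = {}

def get_corrections(wrong: str) -> Optional[list[str]]:
    """Return possible corrections for a wrong abbreviation form, if any."""
    return WRONGDOTS.get(wrong)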
src/tokenizer/abbrev.py: 28 changes (14 additions, 14 deletions)

@@ -33,7 +33,7 @@
 """

-from typing import Generic, Iterator, Optional, Set, List, Dict, TypeVar
+from typing import Generic, Iterator, Optional, TypeVar

 from threading import Lock
 from collections import defaultdict, OrderedDict
@@ -56,7 +56,7 @@ class OrderedSet(Generic[_T]):
     if a standard Python set() was used."""

     def __init__(self) -> None:
-        self._dict: Dict[_T, None] = OrderedDict()
+        self._dict: dict[_T, None] = OrderedDict()

     def add(self, item: _T) -> None:
         """Add an item at the end of the ordered set"""
@@ -75,29 +75,29 @@ class Abbreviations:
     initialized from the config file"""

     # Dictionary of abbreviations and their meanings
-    DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
+    DICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
     # Wrong versions of abbreviations
-    WRONGDICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
+    WRONGDICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
     # All abbreviation meanings
-    MEANINGS: Set[str] = set()
+    MEANINGS: set[str] = set()
     # Single-word abbreviations, i.e. those with only one dot at the end
-    SINGLES: Set[str] = set()
+    SINGLES: set[str] = set()
     # Set of abbreviations without periods, e.g. "td", "osfrv"
-    WRONGSINGLES: Set[str] = set()
+    WRONGSINGLES: set[str] = set()
     # Potential sentence finishers, i.e. those with a dot at the end,
     # marked with an asterisk in the config file
-    FINISHERS: Set[str] = set()
+    FINISHERS: set[str] = set()
     # Abbreviations that should not be seen as such at the end of sentences,
     # marked with an exclamation mark in the config file
-    NOT_FINISHERS: Set[str] = set()
+    NOT_FINISHERS: set[str] = set()
     # Abbreviations that should not be seen as such at the end of sentences, but
     # are allowed in front of person names; marked with a hat ^ in the config file
-    NAME_FINISHERS: Set[str] = set()
+    NAME_FINISHERS: set[str] = set()
     # Wrong versions of abbreviations with possible corrections
     # wrong version : [correction1, correction2, ...]
-    WRONGDOTS: Dict[str, List[str]] = defaultdict(list)
+    WRONGDOTS: dict[str, list[str]] = defaultdict(list)
     # Word forms that should never be interpreted as abbreviations
-    NOT_ABBREVIATIONS: Set[str] = set()
+    NOT_ABBREVIATIONS: set[str] = set()

     # Ensure that only one thread initializes the abbreviations
     _lock = Lock()
@@ -208,7 +208,7 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
             i1 = indices[0]
             i2 = indices[1]
             i3 = indices[2]
-            wabbrevs: List[str] = []
+            wabbrevs: list[str] = []
             # 1 and 2 removed
             wabbrevs.append(abbrev[:i1] + abbrev[i1 + 1 : i2] + abbrev[i2 + 1 :])
             # 1 and 3 removed
@@ -257,7 +257,7 @@ def has_abbreviation(meaning: str) -> bool:
         return meaning in Abbreviations.MEANINGS

     @staticmethod
-    def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
+    def get_meaning(abbrev: str) -> Optional[list[BIN_Tuple]]:
         """Look up meaning(s) of abbreviation, if available."""
         m = Abbreviations.DICT.get(abbrev)
         if not m:
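As a side note, the OrderedSet shim retyped above can be illustrated with a small self-contained sketch. This is a simplified stand-in (TinyOrderedSet is not the class from abbrev.py); it shows why a plain dict[_T, None] suffices as an insertion-ordered set.

# Simplified stand-in for the OrderedSet idea; not the real abbrev.py class.
from typing import Generic, Iterator, TypeVar

_T = TypeVar("_T")

class TinyOrderedSet(Generic[_T]):
    """A set that preserves insertion order by storing its items as dict keys."""

    def __init__(self) -> None:
        # Plain dicts preserve insertion order (guaranteed since Python 3.7),
        # so the values can simply be None.
        self._dict: dict[_T, None] = {}

    def add(self, item: _T) -> None:
        """Add an item at the end of the ordered set; re-adding keeps its position."""
        self._dict[item] = None

    def __iter__(self) -> Iterator[_T]:
        return iter(self._dict)

abbrevs: TinyOrderedSet[str] = TinyOrderedSet()
for a in ("t.d.", "o.s.frv.", "t.d."):
    abbrevs.add(a)
assert list(abbrevs) == ["t.d.", "o.s.frv."]

This keeps iteration order repeatable across runs, which is the point made in the class docstring, since the iteration order of a plain set() of strings is not stable between interpreter runs.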
src/tokenizer/definitions.py: 13 changes (5 additions, 8 deletions)

@@ -29,13 +29,10 @@
 """

 from typing import (
-    Dict,
-    FrozenSet,
     Mapping,
     Tuple,
     Union,
     Callable,
-    List,
     Sequence,
     Optional,
     NamedTuple,
@@ -47,13 +44,13 @@

 BeginTuple = Tuple[int, Optional[int]]
 PunctuationTuple = Tuple[int, str]
-NumberTuple = Tuple[float, Optional[List[str]], Optional[List[str]]]
+NumberTuple = Tuple[float, Optional[list[str]], Optional[list[str]]]
 DateTimeTuple = Tuple[int, int, int]
 MeasurementTuple = Tuple[str, float]
 TimeStampTuple = Tuple[int, int, int, int, int, int]
-AmountTuple = Tuple[float, str, Optional[List[str]], Optional[List[str]]]
+AmountTuple = Tuple[float, str, Optional[list[str]], Optional[list[str]]]
 TelnoTuple = Tuple[str, str]
-CurrencyTuple = Tuple[str, Optional[List[str]], Optional[List[str]]]
+CurrencyTuple = Tuple[str, Optional[list[str]], Optional[list[str]]]


 class BIN_Tuple(NamedTuple):
@@ -434,7 +431,7 @@ class PersonNameTuple(NamedTuple):
 SINGLECHAR_FRACTIONS = "↉⅒⅑⅛⅐⅙⅕¼⅓½⅖⅔⅜⅗¾⅘⅝⅚⅞"

 # Derived unit : (base SI unit, conversion factor/function)
-SI_UNITS: Dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = {
+SI_UNITS: dict[str, Tuple[str, Union[float, Callable[[float], float]]]] = {
     # Distance
     "m": ("m", 1.0),
     "mm": ("m", 1.0e-3),
@@ -538,7 +535,7 @@ class PersonNameTuple(NamedTuple):
         unit + r"(?!\w)" if unit[-1].isalpha() else unit
     )

-SI_UNITS_SET: FrozenSet[str] = frozenset(SI_UNITS.keys())
+SI_UNITS_SET: frozenset[str] = frozenset(SI_UNITS.keys())
 SI_UNITS_REGEX_STRING = r"|".join(
     map(
         # If the unit ends with a letter, don't allow the next character
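The SI_UNITS mapping retyped above pairs each derived unit with its base SI unit and a conversion that is either a plain factor or a callable, per the Union[float, Callable[[float], float]] annotation. Here is a hedged sketch of how one such entry could be applied; convert_to_base is a hypothetical helper, not part of definitions.py, and the sample entries are only shaped like the ones in the mapping.

# Hypothetical helper illustrating the (base unit, factor-or-function) entry
# shape of SI_UNITS; not part of tokenizer/definitions.py.
import math
from typing import Callable, Union

UnitEntry = tuple[str, Union[float, Callable[[float], float]]]

def convert_to_base(value: float, entry: UnitEntry) -> tuple[float, str]:
    """Convert a value in a derived unit to (value in base unit, base unit)."""
    base_unit, factor = entry
    if callable(factor):
        # A callable handles non-multiplicative conversions, e.g. an offset
        return factor(value), base_unit
    return value * factor, base_unit

assert convert_to_base(2.0, ("m", 1.0e3)) == (2000.0, "m")  # e.g. 2 km -> 2000 m
temp_k, unit = convert_to_base(20.0, ("K", lambda c: c + 273.15))  # e.g. 20 °C -> K
assert unit == "K" and math.isclose(temp_k, 293.15)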
src/tokenizer/main.py: 12 changes (6 additions, 6 deletions)

@@ -35,7 +35,7 @@
 """

-from typing import TextIO, Dict, Iterator, List, Callable, Any, Tuple, Union, cast
+from typing import TextIO, Iterator, Callable, Any, Tuple, Union, cast

 import sys
 import argparse
@@ -158,14 +158,14 @@ def main() -> None:
     """Main function, called when the tokenize command is invoked"""

     args = parser.parse_args()
-    options: Dict[str, bool] = dict()
+    options: dict[str, bool] = dict()

     def quote(s: str) -> str:
         """Return the string s within double quotes, and with any contained
         backslashes and double quotes escaped with a backslash"""
         return '"' + s.replace("\\", "\\\\").replace('"', '\\"') + '"'

-    def spanquote(l: List[int]) -> str:
+    def spanquote(l: list[int]) -> str:
         """Return the list l as a string within double quotes"""
         return '"' + "-".join(str(x) for x in l) + '"'

@@ -180,7 +180,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
             return None
         if t.kind == TOK.WORD:
             # Get the full expansion of an abbreviation
-            mm = cast(List[BIN_Tuple], t.val)
+            mm = cast(list[BIN_Tuple], t.val)
             if quote_word:
                 # Return a |-delimited list of possible meanings,
                 # joined into a single string
@@ -254,7 +254,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:

     # Configure our JSON dump function
     json_dumps = partial(json.dumps, ensure_ascii=False, separators=(",", ":"))
-    curr_sent: List[str] = []
+    curr_sent: list[str] = []
     tsep = "" if args.original else " "  # token separator
     for t in tokenize(gen(args.infile), **options):
         if args.csv:
@@ -275,7 +275,7 @@ def val(t: Tok, quote_word: bool = False) -> Any:
                 print('0,"","","",""', file=args.outfile)
             elif args.json:
                 # Output the tokens in JSON format, one line per token
-                d: Dict[str, Union[str, List[int]]] = dict(k=TOK.descr[t.kind])
+                d: dict[str, Union[str, list[int]]] = dict(k=TOK.descr[t.kind])
                 if t.txt is not None:
                     d["t"] = t.txt
                 v = val(t)
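For context on the JSON branch retyped above, the sketch below reuses the compact dump configuration from main.py and feeds it a token-shaped record. The "k" and "t" keys appear in the diff; the "s" span key and all values are invented for illustration.

# Illustration of the compact JSON dump configured in main.py; the example
# record and the "s" key are invented, only "k" and "t" appear in the diff.
import json
from functools import partial
from typing import Union

json_dumps = partial(json.dumps, ensure_ascii=False, separators=(",", ":"))

d: dict[str, Union[str, list[int]]] = {"k": "WORD", "t": "það", "s": [0, 3]}
print(json_dumps(d))  # {"k":"WORD","t":"það","s":[0,3]} - ensure_ascii=False keeps "það" unescaped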
