# index.py
import re
from csv import DictReader
from unidecode import unidecode
import duckdb
from pyphonetics import Metaphone
# CSV file that get_all_wine_records() reads the wine records from.
WINES_FILE_PATH = "wines.csv"
# DuckDB database file that prepare_indexes() connects to and populates.
WINES_DUCKDB_PATH = "wines.duckdb"
# Pattern for the character runs that are ignored when normalizing text.
# normalize() replaces each match with a single space, and the same pattern
# is passed as the `ignore` option when the full-text search index is built,
# so both sides agree on which characters are kept (a-z and 0-9).
IGNORE_REGEXP = "(\\.|[^a-z0-9])+"
def strip_accents(s):
    """Return ``s`` transliterated by ``unidecode``."""
    transliterated = unidecode(s)
    return transliterated
def normalize(s):
    """Return ``s`` accent-stripped, lower-cased and with ignored runs blanked.

    Every run of characters matching ``IGNORE_REGEXP`` is replaced by a
    single space. This follows the default string normalization of DuckDB's
    full-text search extension with one exception: digits are retained,
    because they matter for wine vintages.
    """
    lowered = strip_accents(s).lower()
    return re.sub(IGNORE_REGEXP, " ", lowered)
def tokenize(s):
    """Return the whitespace-separated tokens of the normalized form of ``s``."""
    normalized = normalize(s)
    return normalized.split()
def get_metaphone(s):
    """Return the Metaphone encoding of the string ``s``.

    The Metaphone package ignores non-alphabet characters, whitespace
    included, so a multi-word string is encoded as one unit:

    >>> from pyphonetics import Metaphone
    >>> m = Metaphone()
    >>> m.phonetics("champ onion")
    'XMPNN'
    >>> m.phonetics("champ")
    'XMP'
    >>> m.phonetics("onion")
    'ONN'
    """
    encoder = Metaphone()
    return encoder.phonetics(s)
def get_metaphone_tokens(s):
    """Return the Metaphone of every token of ``s``, in token order.

    >>> get_metaphone_tokens("chateau champignon")
    ['XT', 'XMPNN']
    """
    metaphones = []
    for token in tokenize(s):
        metaphones.append(get_metaphone(token))
    return metaphones
def get_all_wine_records():
    """Read every row of the wines CSV at ``WINES_FILE_PATH``.

    Returns:
        A list with one dictionary per CSV row, keyed by the CSV header,
        e.g.::

            [
                {'name': 'Bucci Villa Bucci Riserva Verdicchio 2013 750ml'},
                {'name': 'Villa Venti "Primo Segno" Sangiovese di Romagna 2019 750ml'},
                ...
            ]

    Raises:
        OSError: if the file cannot be opened.
    """
    # Read from the handle managed by the ``with`` block so that it is
    # closed deterministically; opening the file a second time for the reader
    # would leave that second handle unclosed.
    # newline="" lets the csv module do its own newline handling, which is
    # needed for quoted fields that contain line breaks.
    with open(WINES_FILE_PATH, newline="") as wines_file:
        return list(DictReader(wines_file))
def prepare_indexes(wine_records, token_metaphone_map):
    """Create the tables and full-text index in the DuckDB database.

    Connects to the database at ``WINES_DUCKDB_PATH`` and creates:

    - a ``wines`` table holding one row per wine record,
    - a full-text search index over the ``name`` and ``metaphone_tokens``
      columns of ``wines``,
    - a ``token_metaphones`` table holding the token-to-Metaphone mapping.

    Args:
        wine_records: iterable of dictionaries, each providing the keys
            ``"id"``, ``"name"``, ``"exact_metaphone"`` and
            ``"metaphone_tokens"``.
        token_metaphone_map: dictionary mapping each token to its Metaphone.

    Raises:
        KeyError: if a wine record lacks one of the required keys.

    NOTE(review): both tables are created with a plain ``CREATE TABLE``, so
    this presumably expects a database that does not contain them yet --
    confirm how re-runs are meant to be handled.
    """
    # Materialize both parameter sets up front: a malformed record then fails
    # before the database is opened, and executemany() receives real lists
    # instead of a dict view.
    wine_rows = [
        (r["id"], r["name"], r["exact_metaphone"], r["metaphone_tokens"])
        for r in wine_records
    ]
    token_rows = list(token_metaphone_map.items())

    conn = duckdb.connect(WINES_DUCKDB_PATH)
    try:
        # Create a table containing all our wines
        col_defs = ", ".join(
            [
                "id integer primary key",
                "name text not null",
                "exact_metaphone text not null",
                "metaphone_tokens text not null",
            ]
        )
        conn.execute(f"CREATE TABLE wines ({col_defs})")
        # Skip the insert when there is nothing to insert, so an empty input
        # still yields an (empty) table rather than depending on how
        # executemany() treats an empty parameter list.
        if wine_rows:
            conn.executemany("INSERT INTO wines VALUES (?, ?, ?, ?)", wine_rows)

        # Create a full-text search index on the wine names and metaphone
        # tokens.
        #
        # - The wine name index is used for accurate transcriptions.
        # - The metaphone token index is for our Metaphone Token queries.
        #
        # We can choose which field to search ('name' or 'metaphone_tokens')
        # at query time using DuckDB's FTS extension.
        conn.execute(
            f"PRAGMA create_fts_index('wines', 'id', 'name', 'metaphone_tokens', ignore='{IGNORE_REGEXP}')"
        )

        # Create a table containing our token-to-metaphone mapping.
        #
        # This table is a utility for our Similar Token Metaphone queries.
        col_defs = ", ".join(["token text not null", "metaphone text not null"])
        conn.execute(f"CREATE TABLE token_metaphones ({col_defs})")
        if token_rows:
            conn.executemany("INSERT INTO token_metaphones VALUES (?, ?)", token_rows)
    finally:
        # Always release the connection, even when a statement fails.
        conn.close()
def main():
    """Load the wine records, enrich them, and build the search indexes.

    Each record is enriched in place with the data the indexes need:

        [
            ...
            {'id': 1541,
             'name': 'Bodega Chacra Sin Azufre Pinot Noir 2017 750ml',
             'exact_metaphone': 'BTKXKRSNSFRPNTNRML',
             'metaphone_tokens': 'BTK XKR SN ASFR PNT NR ML'}
            ...
        ]

    Alongside, a mapping of every token seen in a wine name to its Metaphone
    is built for the Similar Token Metaphone query approach:

        {
            ...
            'urbajs': 'URBJS',
            'urbano': 'URBN',
            'uva': 'UF',
            ...
        }

    Both are then handed to ``prepare_indexes``.
    """
    wine_records = get_all_wine_records()

    token_metaphone_map = {}
    # 1-index the record IDs.
    for record_id, record in enumerate(wine_records, start=1):
        name = record["name"]
        # Tokenize each name once and reuse the result for the record fields
        # and the token map alike.
        tokens = tokenize(name)

        # Encode each distinct token only once; tokens seen in earlier
        # records keep their existing entry.
        for token in tokens:
            if token not in token_metaphone_map:
                token_metaphone_map[token] = get_metaphone(token)

        record["id"] = record_id
        record["exact_metaphone"] = get_metaphone(name)
        # Same value as " ".join(get_metaphone_tokens(name)): the map holds
        # get_metaphone(token) for every token of this name.
        record["metaphone_tokens"] = " ".join(
            token_metaphone_map[token] for token in tokens
        )
        record["tokens"] = tokens

    prepare_indexes(wine_records, token_metaphone_map)
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()