-
Notifications
You must be signed in to change notification settings - Fork 0
/
query.php
executable file
·149 lines (131 loc) · 5.23 KB
/
query.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env php
<?php
declare(strict_types=1);
require_once __DIR__.'/vendor/autoload.php';
/**
 * Callback backing SQLite's `X REGEXP Y` operator (registered with two arguments).
 *
 * @param mixed $pattern PCRE pattern including delimiters, e.g. '~^N(\+|$)~'
 * @param mixed $subject Value to test; NULL (an SQL NULL) never matches
 * @return bool True only when the pattern matched; false on no match, NULL input, or a PCRE error
 */
function sqlite_regexp($pattern, $subject): bool {
	// With strict_types=1, handing NULL to preg_match() would raise a TypeError
	if ($pattern === null || $subject === null) {
		return false;
	}
	// preg_match() returns 1 on match, 0 on no match and false on error.
	// Comparing against 1 keeps an error from being reported as a match.
	return preg_match((string)$pattern, (string)$subject) === 1;
}
// Interactive session state used by the main loop:
//   $build - pieces collected so far; joined with '+' when handed to the generator
//   $state - the five most recent unit ids, used as the lookup context (0 = empty slot)
//   $auto  - id of the currently suggested part-of-speech unit, or 0 when there is none
//   $in    - text typed by the user that has not been resolved to a choice yet
$in = '';
$auto = 0;
$state = [0, 0, 0, 0, 0];
$build = [];

// Open the n-gram database read-only, expose sqlite_regexp() as the REGEXP operator,
// and make LIKE case-sensitive before any query runs
$db = new \TDC\PDO\SQLite("ngrams.sqlite", [\PDO::SQLITE_ATTR_OPEN_FLAGS => \PDO::SQLITE_OPEN_READONLY]);
$db->sqliteCreateFunction('regexp', 'sqlite_regexp', 2);
$db->exec("PRAGMA case_sensitive_like = ON");

// u_id => u_text for the 100 most frequent units that either contain '+' or start with one of the
// listed tags, excluding the +vv/+nv/+vn/+nn endings and anything containing a double quote
$pos = [];
$tags = $db->prepexec("SELECT u_id, u_text FROM units WHERE (u_text LIKE '%+%' OR u_text REGEXP '~^(N|V|Pali|Conj|Adv|Interj|Pron|Prop|Num|Symbol)(\+|$)~') AND u_text NOT LIKE '%+vv' AND u_text NOT LIKE '%+nv' AND u_text NOT LIKE '%+vn' AND u_text NOT LIKE '%+nn' AND u_text NOT LIKE '%\"%' ORDER BY cnt DESC LIMIT 100")->fetchAll();
foreach ($tags as $tag) {
	$pos[$tag['u_id']] = $tag['u_text'];
}

// Prepared statements reused on every round of the main loop
// Text of a single unit
$txt = $db->prepare("SELECT u_text FROM units WHERE u_id = ?");
// Units whose text matches a LIKE pattern, most frequent first
$sel_units = $db->prepare("SELECT u_id FROM units WHERE u_text LIKE ? ORDER BY cnt DESC LIMIT 100");
// Four most frequent continuations (u6) of a full five-unit context
$sel_sliding = $db->prepare("SELECT u1, u2, u3, u4, u5, u6, cnt FROM sliding WHERE u1 = ? AND u2 = ? AND u3 = ? AND u4 = ? AND u5 = ? ORDER BY cnt DESC LIMIT 4");
// Most frequent continuation that is one of the units collected in $pos
$sel_auto = $db->prepare("SELECT u6 FROM sliding WHERE u1 = ? AND u2 = ? AND u3 = ? AND u4 = ? AND u5 = ? AND u6 IN (".implode(', ', array_keys($pos)).") ORDER BY cnt DESC LIMIT 1");

// Initialize with most frequent lemmas, shaped like rows of the sliding table with an empty context
$rows = $db->prepexec("SELECT 0 as u1, 0 as u2, 0 as u3, 0 as u4, 0 as u5, u_id as u6, cnt FROM units WHERE u_text REGEXP '~^\"[A-Za-z]~i' ORDER BY cnt DESC LIMIT 4")->fetchAll();
/**
 * Feeds one analysis string to the HFST generator pipeline and returns the first line it prints.
 *
 * The analysis is written to the pipeline's standard input instead of being spliced into the
 * shell command, so quotes, '$' or backticks in unit text cannot alter the command.
 * `grep -vF "?"` drops every output line containing '?' (presumably failed generations - confirm
 * against the generator's output format), and `head -n 1` keeps the first remaining line.
 *
 * @param string $analysis Units joined with '+'
 * @return string Trimmed first output line, or '' when the pipeline printed nothing or could not be started
 */
function hfst_generate(string $analysis): string {
	$cmd = 'hfst-optimized-lookup -p -u ~/langtech/kal/src/generator-gt-norm.hfstol | grep -vF "?" | head -n 1';
	// Only stdin and stdout are piped; stderr stays attached to the terminal as it was with shell_exec()
	$proc = proc_open($cmd, [0 => ['pipe', 'r'], 1 => ['pipe', 'w']], $pipes);
	if (!is_resource($proc)) {
		return '';
	}
	fwrite($pipes[0], $analysis."\n");
	fclose($pipes[0]);
	$result = stream_get_contents($pipes[1]);
	fclose($pipes[1]);
	proc_close($proc);
	// stream_get_contents() may return false; the cast keeps trim() valid under strict_types
	return trim((string)$result);
}

// Main interactive loop: print the current state and up to four continuations, read one line from
// STDIN, then either apply a '#N' selection or narrow the continuations by the typed prefix.
while (true) {
	echo "S: ".implode(' ', $state)." ; B: ".implode(' ', $build)." ; IN: $in\n";
	$out = [];

	// Current state yielded no possible continuations, so try to recover by backing off:
	// each round ignores one more of the oldest units and matches only on u{$i}..u5
	for ($i=2 ; $i < 6 && empty($rows) ; ++$i) {
		// Bind exactly the state entries that belong to u{$i}..u5, so the number of values
		// always equals the number of placeholders
		$qs = array_slice($state, $i-1);
		$us = [];
		for ($u=$i ; $u < 6 ; ++$u) {
			$us[] = "u{$u} = ?";
		}
		$us = implode(' AND ', $us);
		$rows = $db->prepexec("SELECT u1, u2, u3, u4, u5, u6, cnt FROM sliding WHERE {$us} ORDER BY cnt DESC LIMIT 4", $qs)->fetchAll();
	}

	// Turn each row into [next state (u2..u6), text of u6, count]
	foreach ($rows as $row) {
		$txt->execute([$row['u6']]);
		$nstate = array_values($row);
		array_pop($nstate);   // drop cnt
		array_shift($nstate); // drop u1
		$out[] = [$nstate, $txt->fetch()['u_text'], $row['cnt']];
	}
	foreach ($out as $k => $o) {
		echo "\t#$k: {$o[1]} ({$o[2]}) (".implode(',', $o[0]).")\n";
	}

	$r = fgets(STDIN);
	if ($r === false) {
		// End of input
		break;
	}
	$in .= trim($r);

	if ($r === " \n") {
		if ($auto) {
			// User accepted the most likely part-of-speech unit
			// Fake that by putting it as the first option and selecting that option
			$in = "#0";
			$out[0] = [$state, $pos[$auto]];
			array_shift($out[0][0]);
			$out[0][0][] = $auto;
			$auto = 0;
		}
		else {
			// We have no good state, so try to recover
		}
	}

	if (preg_match('~#([0-9]+)$~', $in, $m)) {
		// User picked a unit from the list
		$pick = intval($m[1]);
		if (!isset($out[$pick])) {
			// Out-of-range choice: keep state and options untouched and ask again
			echo "No such option: #$pick\n";
			$in = '';
			continue;
		}
		$state = $out[$pick][0];
		$text = (string)$out[$pick][1];

		// A unit starting with TA, AA or a double quote closes the word built so far: generate it first
		if (!empty($build) && preg_match('~^(TA|AA|")~', $text)) {
			$emit = hfst_generate(implode('+', $build));
			if ($emit !== '') {
				echo "EMIT: $emit\n";
				$build = [];
				// Zero the context entries that precede the first entry equal to the newest unit
				$final = $state[count($state)-1];
				foreach ($state as $k => $v) {
					if ($v === $final) {
						break;
					}
					$state[$k] = 0;
				}
			}
		}

		if (strncmp($text, '"', 1) === 0) {
			// Quoted unit: store it without the surrounding quotes
			$build[] = substr($text, 1, -1);
		}
		else {
			// Rewrite a trailing +vv/+vn/+nv/+nn as +Der/xx and drop a leading CONJ-/ADV- prefix
			$build[] = preg_replace('~^(CONJ|ADV)-~', '', preg_replace('~\+([vn][vn])$~', '+Der/$1', $text));
		}

		// Preview the word with the most likely part-of-speech unit appended
		$sel_auto->execute($state);
		$auto = $sel_auto->fetchColumn(0);
		if ($auto) {
			$emit = hfst_generate(implode('+', $build).'+'.$pos[$auto]);
			echo "$auto: {$pos[$auto]} => {$emit}\n";
		}

		$sel_sliding->execute($state);
		$rows = $sel_sliding->fetchAll();
		$in = '';
	}
	else {
		// User typed a letter, so try to find units starting/continuing with that letter
		$sel_units->execute(["$in%"]);
		$units = $sel_units->fetchAll(PDO::FETCH_COLUMN, 0);

		// Exclude currently shown continuations (index 4 of each next-state is that option's u6)
		$u6_not = '';
		if (!empty($out)) {
			$u6_not = "AND u6 NOT IN (".implode(', ', array_column(array_column($out, 0), 4)).")";
		}

		if (empty($units)) {
			// Nothing matches the typed text: show the next continuations for the full context
			$qs = $state;
			$rows = $db->prepexec("SELECT u1, u2, u3, u4, u5, u6, cnt FROM sliding WHERE u1 = ? AND u2 = ? AND u3 = ? AND u4 = ? AND u5 = ? {$u6_not} ORDER BY cnt DESC LIMIT 4", $qs)->fetchAll();
		}
		else {
			//echo "Found ".count($units)." partial units matching $in: ".implode(', ', $units)."\n";
			$qs = array_merge($state, $units);
			$rows = $db->prepexec("SELECT u1, u2, u3, u4, u5, u6, cnt FROM sliding WHERE u1 = ? AND u2 = ? AND u3 = ? AND u4 = ? AND u5 = ? {$u6_not} AND u6 IN (?".str_repeat(', ?', count($units)-1).") ORDER BY cnt DESC LIMIT 4", $qs)->fetchAll();
			// Back off to shorter contexts; $qs loses its oldest state entry each round
			for ($i=2 ; $i < 6 && empty($rows) ; ++$i) {
				array_shift($qs);
				$us = [];
				for ($u=$i ; $u < 6 ; ++$u) {
					$us[] = "u{$u} = ?";
				}
				$us = implode(' AND ', $us);
				$rows = $db->prepexec("SELECT u1, u2, u3, u4, u5, u6, cnt FROM sliding WHERE {$us} {$u6_not} AND u6 IN (?".str_repeat(', ?', count($units)-1).") ORDER BY cnt DESC LIMIT 4", $qs)->fetchAll();
			}
		}
	}
}