Skip to content

Commit

Permalink
ss.SearchQuickestFirst()
Browse files Browse the repository at this point in the history
  • Loading branch information
e-gun committed Oct 13, 2023
1 parent df2dc5f commit 78ac455
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 39 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
### STATUS (v1.2.14):

* 25%-700% faster than HipparchiaServer depending on the function. Uses c. 60% as much RAM.
* monolithic binary: no need for extra files/folders
* monolithic binary: no need for extra files/folders; no need to chase dependencies

### FEATURES

Expand All @@ -21,6 +21,7 @@
* search scope exclusions
* near/not-near syntax
* progress polling
* automatic rewrites of searches to optimize for speed
* semantic vectors
* general
* configurable model parameters
Expand Down
6 changes: 3 additions & 3 deletions fyi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,13 @@ self-test with vectors can be deceptive because `-wc` flag will not override con
43 unique files.
0 files ignored.
github.com/AlDanial/cloc v 1.98 T=0.04 s (1015.3 files/s, 448135.7 lines/s)
github.com/AlDanial/cloc v 1.98 T=0.04 s (1087.2 files/s, 489430.3 lines/s)
-------------------------------------------------------------------------------
Language files blank comment code
-------------------------------------------------------------------------------
Go 43 2795 3100 13084
Go 43 2889 3193 13275
-------------------------------------------------------------------------------
SUM: 43 2795 3100 13084
SUM: 43 2889 3193 13275
-------------------------------------------------------------------------------
```
14 changes: 2 additions & 12 deletions rt-search.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,18 +150,8 @@ func InitializeSearch(c echo.Context, user string) SearchStruct {
srch.IPAddr = c.RealIP()

srch.CleanInput()
srch.SetType() // must happen before SSBuildQueries()

// if BoxA has a lemma and BoxB has a phrase, it is almost certainly faster to search B, then A...
if srch.HasLemmaBoxA && srch.HasPhraseBoxB {
srch.SwapPhraseAndLemma()
}

// all forms of an uncommon word should be sought before all forms of a common word...
if srch.HasLemmaBoxA && srch.HasLemmaBoxB {
srch.PickFastestLemma()
}

srch.SetType() // must happen before SSBuildQueries()
srch.Optimize() // maybe rewrite the search to make it faster
srch.FormatInitialSummary()

if srch.IsVector {
Expand Down
23 changes: 8 additions & 15 deletions searchforanearb.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,6 @@ func XWordsFeeder(ctx context.Context, kvp *[]KVPair, ss *SearchStruct) (<-chan
default:
remainder = len(ss.Queries) - i - 1
if remainder%POLLEVERYNTABLES == 0 {
// ss.Remain.Set(remainder)
SIUpdateRemain <- SIKVi{ss.ID, remainder}
}
emit <- (*kvp)[i]
Expand Down Expand Up @@ -446,11 +445,6 @@ func XWordsCheckFinds(p KVPair, basicprxfinder *regexp.Regexp, submatchsrchfinde
// the default return is "not a hit"
result := -1

// now we have a new problem: Sought all 19 forms of »φύϲιϲ« within 4 words of »ἀδύνατον γὰρ«
// what if the string contains multiple valid values for term #1?
// [291] ϲτερεῶν ἅψηται ὁ πυρετόϲ ἐπειδὴ μὴ ὁμαλῶϲ θερμαίνεται ἀλλὰ ἀνωμάλωϲ εἰϲὶ γάρ τινα μόρια κατὰ φύϲιν ἔχοντα τινὰ δὲ παρὰ φύϲιν ϲυμβαίνει τὰ κατὰ φύϲιν ἔχοντα ἀντιλαμβάνεϲθαι τῶν παρὰ φύϲιν διακειμένων ἀδύνατον γὰρ ὁμαλὴν γενέϲθαι τὴν δυϲκραϲίαν οἱ δὲ ἑκτικῷ κατεϲχημένοι πυρετῷ τοῦτο δέ ἐϲτιν οἱ τὰ ϲτερεὰ πυρέττοντεϲ
//

// quick preliminary test (which does seem to shave 5-10% from your time...)
possible := false
if basicprxfinder.MatchString(p.V) && !notnear {
Expand Down Expand Up @@ -483,12 +477,17 @@ func XWordsCheckFinds(p KVPair, basicprxfinder *regexp.Regexp, submatchsrchfinde

// but we can't build the tail without making another check...

// Sought »ἐϲχάτη χθονόϲ« within 9 words of all 41 forms of »γαῖα«
// in the following we pick up the first »ἐϲχάτη χθονόϲ« of two copies of and set it as the border, but miss a hit if you do not look after the second...
// Example: Sought »ἐϲχάτη χθονόϲ« within 9 words of all 41 forms of »γαῖα«
// with the following "initial hit" we pick up the first of two copies of »ἐϲχάτη χθονόϲ« and set it as the border,
// but you will miss a final hit if you do not continue to look after the second copy since γῆϲ comes after #2
// and therefore >9 wds after the initial hit...

// [9] ὁ ποιητὴϲ ἐνταῦθά φηϲιν οὐ τὰ πρὸϲ ὠκεανὸν ἀλλὰ τὰ ἐκεῖ πρὸϲ τῇ κατὰ νεῖλον θαλάϲϲῃ καθὰ καὶ αἰϲχύλοϲ εἰπών ἔϲτιν πόλιϲ κάνωβοϲ ἐϲχάτη χθονόϲ πᾶϲα γὰρ ἀγχίαλοϲ ἐϲχάτη χθονόϲ διὸ καὶ μενελαϊ/τηϲ νομὸϲ ἐκεῖ ὡϲ τοιαύτηϲ γῆϲ ὑπὸ μενελάῳ ποτὲ γενομένηϲ steph byz ἀπόλλωνοϲ πόλιϲ ἐν αἰγύπτῳ πρὸϲ
// h false νεῖλον θαλάϲϲῃ καθὰ καὶ αἰϲχύλοϲ εἰπών ἔϲτιν πόλιϲ κάνωβοϲ
// t false πᾶϲα γὰρ ἀγχίαλοϲ ἐϲχάτη χθονόϲ διὸ καὶ μενελαϊ/τηϲ
// this split is baked in via RGX above: `^(?P<head>.*?)%s(?P<tail>.*?)$`

// IterativeProxWordsMatching() constructs the solution: if there are N versions of the initial term, build and merge
// N mini environs and return this as the "tail"

checkfordupes := submatchsrchfinder.FindStringSubmatch(tail)

Expand Down Expand Up @@ -517,12 +516,6 @@ func XWordsCheckFinds(p KVPair, basicprxfinder *regexp.Regexp, submatchsrchfinde
}
} else {
// collect hits

// pf := fmt.Sprintf("\n\treg\t%s", basicprxfinder.String())
// pf := ""
// htv := "[%d]\t%s%s\n\t%t\t%s\n\t%t\t%s"
// msg(fmt.Sprintf(htv, p.K, p.V, pf, basicprxfinder.MatchString(head), head, basicprxfinder.MatchString(tail), tail), MSGNOTE)

if basicprxfinder.MatchString(head) || basicprxfinder.MatchString(tail) {
result = p.K
}
Expand Down
85 changes: 77 additions & 8 deletions searchstructs.go
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,26 @@ func (s *SearchStruct) InclusionOverview(sessincl SearchIncExl) string {
return r
}

// Optimize - rewrite the search, where possible, into an equivalent one that should run faster;
// the cases are checked in order and at most one rewrite is applied per call
func (s *SearchStruct) Optimize() {
	switch {
	case s.HasLemmaBoxA && s.HasPhraseBoxB:
		// lemma in BoxA + phrase in BoxB: it is almost certainly faster to search B, then A...
		s.SwapPhraseAndLemma()
	case s.HasLemmaBoxA && s.HasLemmaBoxB:
		// two lemmata: all forms of an uncommon word should (usually) be sought before all forms of a common word...
		s.PickFastestLemma()
	case s.Seeking != "" && s.Proximate != "":
		// two non-empty search terms: consider reordering them so that the quicker one is sought first
		s.SearchQuickestFirst()
	}
}

// PickFastestLemma - all forms of an uncommon word should (usually) be sought before all forms of a common word
func (s *SearchStruct) PickFastestLemma() {
// Sought all 65 forms of »δημηγορέω« within 1 lines of all 386 forms of »γιγνώϲκω«
Expand All @@ -281,15 +301,10 @@ func (s *SearchStruct) PickFastestLemma() {
// the penalty for being wrong is relatively low; the savings when you get this right can be significant

const (
FAIL = "PickFastestLemma() called even though this is not a two-lemma search. Aborting."
NOTE1 = "PickFastestLemma() is swapping %s for %s: possible hits %d < %d && known forms %d < %d"
NOTE1 = "PickFastestLemma() is swapping %s for %s: possible hits %d < %d; known forms %d < %d"
NOTE2 = "PickFastestLemma() is NOT swapping %s for %s: possible hits %d vs %d; known forms %d vs %d"
)

if !s.HasLemmaBoxA || !s.HasLemmaBoxB {
msg(FAIL, MSGWARN)
}

hw1 := headwordlookup(s.LemmaOne)
hw2 := headwordlookup(s.LemmaTwo)

Expand All @@ -314,6 +329,8 @@ func (s *SearchStruct) SwapPhraseAndLemma() {
// no SwapPhraseAndLemma(): [Δ: 4.564s] lemma near phrase: 'γαῖα' near 'ἐϲχάτη χθονόϲ'
// yes SwapPhraseAndLemma(): [Δ: 1.276s] lemma near phrase: 'γαῖα' near 'ἐϲχάτη χθονόϲ'

msg("SwapPhraseAndLemma() was called", MSGPEEK)

boxa := s.LemmaOne
boxb := s.Proximate
s.Seeking = boxb
Expand All @@ -337,6 +354,60 @@ func (s *SearchStruct) SwapPhraseAndLemma() {
s.SetType()
}

// SearchQuickestFirst - reorder s.Seeking and s.Proximate so that the term likely to generate fewer initial hits
// is sought first; the decision table is:
//
//	single word + single word: swap if Seeking has fewer characters than Proximate
//	phrase      + phrase:      swap if Seeking has fewer characters than Proximate
//	single word + phrase:      leave as is
//	phrase      + single word: always swap, because single words beat phrases
//
// "characters" are counted as runes, not bytes: polytonic Greek mixes 2-byte and 3-byte code points, so the byte
// length of a string is not a reliable stand-in for how many characters it contains.
func (s *SearchStruct) SearchQuickestFirst() {
	const (
		NOTE = "SearchQuickestFirst() swapping '%s' and '%s'"
	)

	// a long phrase is slower than a single word:
	// faster: Sought »ἡδονήν« within 1 lines of »τέλουϲ τῆϲ φιλοϲοφίαϲ«
	// slower: Sought »τέλουϲ τῆϲ φιλοϲοφίαϲ« within 1 lines of »ἡδονήν«

	// a term is a phrase if a space remains after trimming the whitespace at its edges
	isphrase := func(term string) bool {
		return strings.Contains(strings.TrimSpace(term), " ")
	}

	// count characters, not bytes
	runecount := func(term string) int {
		return len([]rune(term))
	}

	skg := s.Seeking
	prx := s.Proximate

	skgisphrase := isphrase(skg)
	prxisphrase := isphrase(prx)

	var swap bool
	switch {
	case skgisphrase == prxisphrase:
		// two single words or two phrases: the one with more characters should come first
		swap = runecount(skg) < runecount(prx)
	case skgisphrase:
		// phrase + single word: quicker to swap because single words beat phrases
		swap = true
	default:
		// single word + a phrase: fastest to do nothing
		swap = false
	}

	if !swap {
		return
	}

	s.Proximate = skg
	s.Seeking = prx
	msg(fmt.Sprintf(NOTE, skg, prx), MSGPEEK)
}

// SortResults - sort the search results by the session's registerselection criterion
func (s *SearchStruct) SortResults() {
// Closures that order the DbWorkline structure:
Expand Down Expand Up @@ -461,7 +532,6 @@ func (sv *SearchVault) SimpleGetSS(id string) SearchStruct {

// Delete - get rid of a search (probably for good, but see "Purge")
func (sv *SearchVault) Delete(id string) {
// msg("SearchVault deleting "+id, 1)
SIDel <- id
sv.mutex.Lock()
defer sv.mutex.Unlock()
Expand All @@ -470,7 +540,6 @@ func (sv *SearchVault) Delete(id string) {

// Purge is just Delete; the separate name makes the code logic more legible: "Purge" implies that this search
// is likely to reappear with an "Update"
func (sv *SearchVault) Purge(id string) {
// NOTE(review): Delete() itself begins by sending id on SIDel, so a Purge() sends the same id twice;
// confirm whether the duplicate send is intended or whether this line can go
SIDel <- id
sv.Delete(id)
}
Expand Down

0 comments on commit 78ac455

Please sign in to comment.