Skip to content

Commit

Permalink
Remove Go plugin tokenizers and bundle them natively instead.
Browse files Browse the repository at this point in the history
Go plugins, unfortunately, have severe limitations and are not ideal
for plugins that may need wide distribution.
golang/go#20481

This patch gets rid of the tokenizer plugin system entirely and just
bundles the available tokenizers (phonetic: Kannada, Malayalam) into
the core. Widely usable tokenizers can henceforth be bundled into the
core, just like how Postgres comes with bundled TSVECTOR dictionaries.

Also, it is possible to write custom tokenizers as Postgres plugins and
load them into Postgres dynamically, making the Go tokenizer plugin
system superfluous.
  • Loading branch information
knadh committed Dec 11, 2021
1 parent 230f4e3 commit fd61ee7
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 94 deletions.
9 changes: 2 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,12 @@ deps:

.PHONY: build
build:
go build -o ${BIN} -ldflags="-s -w -X 'main.buildString=${BUILDSTR}'" cmd/${BIN}/*.go
go build -gcflags="-G=3" -o ${BIN} -ldflags="-s -w -X 'main.buildString=${BUILDSTR}'" cmd/${BIN}/*.go

.PHONY: run
run: build build-tokenizers
run:
./${BIN}

.PHONY: build-tokenizers
build-tokenizers:
go build -ldflags="-s -w" -buildmode=plugin -o kannada.tk tokenizers/kannada/kannada.go
go build -ldflags="-s -w" -buildmode=plugin -o malayalam.tk tokenizers/malayalam/malayalam.go

# Compile bin and bundle static assets.
.PHONY: dist
dist: build build-tokenizers
Expand Down
87 changes: 30 additions & 57 deletions cmd/dictmaker/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,44 +5,43 @@ import (
"fmt"
"html/template"
"io/ioutil"
"log"
"net/http"
"net/url"
"os"
"path/filepath"
"plugin"
"strings"

"github.com/go-chi/chi"
"github.com/go-chi/chi/middleware"
"github.com/jmoiron/sqlx"
"github.com/knadh/dictmaker/internal/data"
"github.com/knadh/dictmaker/tokenizers/indicphone"
"github.com/knadh/koanf"
"github.com/knadh/stuffbin"
)

// connectDB initializes a database connection.
func connectDB(host string, port int, user, pwd, dbName string) (*sqlx.DB, error) {
// initDB initializes a database connection.
func initDB(host string, port int, user, pwd, dbName string) *sqlx.DB {
db, err := sqlx.Connect("postgres",
fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=disable", host, port, user, pwd, dbName))
if err != nil {
return nil, err
logger.Fatalf("error intiializing DB: %v", err)
}

return db, nil
return db
}

// initFileSystem initializes the stuffbin FileSystem to provide
// initFS initializes the stuffbin FileSystem to provide
// access to bundled static assets to the app.
func initFileSystem() (stuffbin.FileSystem, error) {
func initFS() stuffbin.FileSystem {
path, err := os.Executable()
if err != nil {
return nil, err
logger.Fatalf("error getting executable path: %v", err)
}

fs, err := stuffbin.UnStuff(path)
if err == nil {
return fs, nil
return fs
}

// Running in local mode. Load the required static assets into
Expand All @@ -58,10 +57,10 @@ func initFileSystem() (stuffbin.FileSystem, error) {

fs, err = stuffbin.NewLocalFS("/", files...)
if err != nil {
return nil, fmt.Errorf("failed to initialize local file for assets: %v", err)
logger.Fatalf("failed to initialize local file for assets: %v", err)
}

return fs, nil
return fs
}

// loadSiteTheme loads a theme from a directory.
Expand Down Expand Up @@ -104,36 +103,16 @@ func loadSiteTheme(path string, loadPages bool) (*template.Template, error) {
func initAdminTemplates(path string) *template.Template {
t, err := template.New("admin").ParseGlob(path + "/*.html")
if err != nil {
log.Fatalf("error loading admin templates: %v", err)
logger.Fatalf("error loading admin templates: %v", err)
}
return t
}

// loadTokenizerPlugin loads a tokenizer plugin that implements data.Tokenizer
// from the given path.
func loadTokenizerPlugin(path string) (data.Tokenizer, error) {
plg, err := plugin.Open(path)
if err != nil {
return nil, fmt.Errorf("error loading tokenizer plugin '%s': %v", path, err)
}

newFunc, err := plg.Lookup("New")
if err != nil {
return nil, fmt.Errorf("New() function not found in plugin '%s': %v", path, err)
}

f, ok := newFunc.(func() (data.Tokenizer, error))
if !ok {
return nil, fmt.Errorf("New() function is of invalid type in plugin '%s'", path)
// initTokenizers initializes all bundled tokenizers.
func initTokenizers() map[string]data.Tokenizer {
return map[string]data.Tokenizer{
"indicphone": indicphone.New(),
}

// Initialize the plugin.
p, err := f()
if err != nil {
return nil, fmt.Errorf("error initializing provider plugin '%s': %v", path, err)
}

return p, err
}

// initHandlers registers HTTP handlers.
Expand Down Expand Up @@ -183,32 +162,29 @@ func initHandlers(r *chi.Mux, app *App) {

// initLangs loads language configuration into a given *App instance.
func initLangs(ko *koanf.Koanf) data.LangMap {
out := make(data.LangMap)
var (
tks = initTokenizers()
out = make(data.LangMap)
)

// Language configuration.
for _, l := range ko.MapKeys("lang") {
lang := data.Lang{Types: make(map[string]string)}
if err := ko.UnmarshalWithConf("lang."+l, &lang, koanf.UnmarshalConf{Tag: "json"}); err != nil {
log.Fatalf("error loading languages: %v", err)
logger.Fatalf("error loading languages: %v", err)
}

// Load external plugin.
logger.Printf("language: %s", l)

if lang.TokenizerType == "plugin" {
tk, err := loadTokenizerPlugin(lang.TokenizerName)
if err != nil {
log.Fatalf("error loading tokenizer plugin for %s: %v", l, err)
// Does the language use a bundled tokenizer?
if lang.TokenizerType == "custom" {
t, ok := tks[lang.TokenizerName]
if !ok {
logger.Fatalf("unknown custom tokenizer '%s'", lang.TokenizerName)
}

lang.Tokenizer = tk

// Tokenizations for search queries are looked up by the tokenizer
// ID() returned by the plugin and not the filename in the config.
lang.TokenizerName = tk.ID()
logger.Printf("loaded tokenizer %s", lang.TokenizerName)
lang.Tokenizer = t
}

// Load external plugin.
logger.Printf("language: %s", l)
out[l] = lang
}

Expand All @@ -222,10 +198,7 @@ func generateNewFiles() error {

// Initialize the static file system into which all
// required static assets (.sql, .js files etc.) are loaded.
fs, err := initFileSystem()
if err != nil {
return err
}
fs := initFS()

// Generate config file.
b, err := fs.Read("config.toml.sample")
Expand Down
17 changes: 4 additions & 13 deletions cmd/dictmaker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func init() {
f.Bool("version", false, "current version of the build")

if err := f.Parse(os.Args[1:]); err != nil {
log.Fatalf("error parsing flags: %v", err)
logger.Fatalf("error parsing flags: %v", err)
}

if ok, _ := f.GetBool("version"); ok {
Expand Down Expand Up @@ -113,31 +113,22 @@ func init() {

func main() {
// Connect to the DB.
db, err := connectDB(ko.String("db.host"),
db := initDB(ko.String("db.host"),
ko.Int("db.port"),
ko.String("db.user"),
ko.String("db.password"),
ko.String("db.db"),
)
if err != nil {
logger.Fatalf("error connecting to DB: %v", err)
}

defer db.Close()

fs, err := initFileSystem()
if err != nil {
logger.Fatal(err)
}

// Initialize the app context that's passed around.
app := &App{
constants: constants{
Site: ko.String("site"),
RootURL: ko.String("app.root_url"),
},
db: db,
fs: fs,
fs: initFS(),
logger: logger,
}

Expand All @@ -148,7 +139,7 @@ func main() {
}

// Load SQL queries.
qB, err := fs.Read("/queries.sql")
qB, err := app.fs.Read("/queries.sql")
if err != nil {
logger.Fatalf("error reading queries.sql: %v", err)
}
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module github.com/knadh/dictmaker

go 1.12
go 1.17

require (
github.com/go-chi/chi v4.1.2+incompatible
Expand All @@ -10,13 +10,13 @@ require (
github.com/knadh/koanf v0.15.0
github.com/knadh/paginator v0.0.0-20210310070812-ae09d514e148
github.com/knadh/stuffbin v1.1.0
github.com/kr/pretty v0.1.0 // indirect
github.com/lib/pq v1.10.0
github.com/mitchellh/mapstructure v1.4.1 // indirect
github.com/pelletier/go-toml v1.8.1 // indirect
github.com/spf13/pflag v1.0.5
gitlab.com/joice/mlphone-go v0.0.0-20201001084309-2bb02984eed8
golang.org/x/sys v0.0.0-20210326220804-49726bf1d181 // indirect
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
gopkg.in/volatiletech/null.v6 v6.0.0-20170828023728-0bef4e07ae1b
)

require github.com/fsnotify/fsnotify v1.4.9 // indirect
9 changes: 2 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,6 @@ github.com/knadh/paginator v0.0.0-20210310070812-ae09d514e148 h1:5KojMX5qCcq89QL
github.com/knadh/paginator v0.0.0-20210310070812-ae09d514e148/go.mod h1:80FK5OPRRQQKEK75ahG+92/MdX/lu4dE8loTzJRVcCQ=
github.com/knadh/stuffbin v1.1.0 h1:f5S5BHzZALjuJEgTIOMC9NidEnBJM7Ze6Lu1GHR/lwU=
github.com/knadh/stuffbin v1.1.0/go.mod h1:yVCFaWaKPubSNibBsTAJ939q2ABHudJQxRWZWV5yh+4=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/lib/pq v1.10.0 h1:Zx5DJFEYQXio93kgXnQ09fXNiUKsqv4OUEu2UtGcB1E=
github.com/lib/pq v1.10.0/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
Expand Down Expand Up @@ -113,6 +108,7 @@ golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73r
golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand All @@ -130,6 +126,7 @@ golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20210326220804-49726bf1d181 h1:64ChN/hjER/taL4YJuA+gpLfIMT+/NFherRZixbxOhg=
golang.org/x/sys v0.0.0-20210326220804-49726bf1d181/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20181227161524-e6919f6577db h1:6/JqlYfC1CCaLnGceQTI+sDGhC9UBSPAsBqI0Gun6kU=
golang.org/x/text v0.3.1-0.20181227161524-e6919f6577db/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
Expand All @@ -146,8 +143,6 @@ google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZi
google.golang.org/grpc v1.22.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
gopkg.in/asn1-ber.v1 v1.0.0-20181015200546-f715ec2f112d/go.mod h1:cuepJuh7vyXfUyUwEgHQXw849cJrilpS5NeIjOWESAw=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/square/go-jose.v2 v2.3.1/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI=
gopkg.in/volatiletech/null.v6 v6.0.0-20170828023728-0bef4e07ae1b h1:P+3+n9hUbqSDkSdtusWHVPQRrpRpLiLFzlZ02xXskM0=
gopkg.in/volatiletech/null.v6 v6.0.0-20170828023728-0bef4e07ae1b/go.mod h1:0LRKfykySnChgQpG3Qpk+bkZFWazQ+MMfc5oldQCwnY=
Expand Down
19 changes: 12 additions & 7 deletions internal/data/data.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,13 @@ type LangMap map[string]Lang
// Tokenizer represents a function that takes a string
// and returns a list of Postgres tsvector tokens.
type Tokenizer interface {
ID() string
Name() string

// ToTokens takes a string and tokenizes it into a list of tsvector tokens
// that can be stored in the database for fulltext search.
ToTokens(string) []string
ToTokens(s string, lang string) ([]string, error)

// ToQuery takes a search string and returns a Postgres tsquery string,
// for example 'fat & cat'.
ToQuery(string) string
ToQuery(s string, lang string) (string, error)
}

// Token represents a Postgres tsvector token.
Expand Down Expand Up @@ -182,7 +179,11 @@ func (d *Data) Search(q Query) (Entries, int, error) {
} else {
// If there's an external tokenizer loaded, run it to get the tokens
// and pass it to the DB directly instructing the DB not to tokenize internally.
tsVectorQuery = tk.ToQuery(q.Query)
var err error
tsVectorQuery, err = tk.ToQuery(q.Query, q.FromLang)
if err != nil {
return nil, 0, err
}
}

// Filters ($1 to $3)
Expand Down Expand Up @@ -293,7 +294,11 @@ func (d *Data) InsertEntry(e Entry) (int, error) {
} else {
// If there's an external tokenizer loaded, run it to get the tokens
// and pass it to the DB directly instructing the DB not to tokenize internally.
tokens = strings.Join(lang.Tokenizer.ToTokens(e.Content), " ")
t, err := lang.Tokenizer.ToTokens(e.Content, e.Lang)
if err != nil {
return 0, nil
}
tokens = strings.Join(t, " ")
}
}

Expand Down
Loading

0 comments on commit fd61ee7

Please sign in to comment.