diff --git a/build.gradle b/build.gradle index cdf7655..7bac38a 100644 --- a/build.gradle +++ b/build.gradle @@ -3,11 +3,10 @@ apply plugin: 'groovy' apply plugin: 'maven' group = 'org.sotnya' -version = '1.0.1' +version = '1.0.2' sourceCompatibility = "1.8" targetCompatibility = "1.8" - description = """Ukrainian lemmatizer plugin for ElasticSearch""" repositories { @@ -19,14 +18,14 @@ repositories { } dependencies { - compile 'org.elasticsearch:elasticsearch:1.7.4' - compile 'junit:junit:4.12' - - testCompile 'org.apache.lucene:lucene-test-framework:4.10.4' - testCompile 'org.elasticsearch:elasticsearch:1.7.4:tests' - testCompile 'com.google.guava:guava:18.0' - testCompile 'com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.1.14' - testCompile 'org.hamcrest:hamcrest-all:1.3' + compile group: 'org.elasticsearch', name: 'elasticsearch', version:'1.7.5' + + testCompile group: 'junit', name: 'junit', version: '4.12' + testCompile group: 'org.apache.lucene', name: 'lucene-test-framework', version: '4.10.4' + testCompile group: 'org.elasticsearch', name: 'elasticsearch', version: '1.7.5', classifier: 'tests' + testCompile group: 'com.google.guava', name: 'guava', version: '19.0' + testCompile group: 'com.carrotsearch.randomizedtesting', name: 'randomizedtesting-runner', version: '2.1.14' + testCompile group: 'org.hamcrest', name: 'hamcrest-library', version: '1.3' } File explodedDistDir = new File(distsDir, 'exploded') @@ -41,21 +40,30 @@ task explodedDist(dependsOn: [jar], description: 'Builds the plugin zip file') < into explodedDistDir } - ant.delete { - fileset(dir: explodedDistDir, includes: "elasticsearch-*.jar") - } + ant.delete { fileset(dir: explodedDistDir, includes: "elasticsearch-*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "asm*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "antlr*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "compress*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "commons-cli*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "elasticsearch-*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "guava*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "hamcrest*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "HdrHistogram*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "hppc*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "jackson*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "joda*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "jsr*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "junit*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "lucene-*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "netty-*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "snakeyaml*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "spatial4j-*.jar") } + ant.delete { fileset(dir: explodedDistDir, includes: "t-digest*.jar") } copy { from libsDir into explodedDistDir } - - ant.delete { fileset(dir: explodedDistDir, includes: "lucene-*.jar") } - ant.delete { fileset(dir: explodedDistDir, includes: "spatial4j-*.jar") } - ant.delete { fileset(dir: explodedDistDir, includes: "hamcrest*.jar") } - ant.delete { fileset(dir: explodedDistDir, includes: "junit*.jar") } - ant.delete { fileset(dir: explodedDistDir, includes: "asm*.jar") } - ant.delete { fileset(dir: explodedDistDir, includes: "antlr*.jar") } } task zip(type: Zip, dependsOn: ['explodedDist']) { diff --git a/src/main/java/org/elasticsearch/index/analysis/ukrainian_lemmatizer/UkrainianLemmatizerTokenFilter.java b/src/main/java/org/elasticsearch/index/analysis/ukrainian_lemmatizer/UkrainianLemmatizerTokenFilter.java index 8eb25e0..faf580c 100644 --- a/src/main/java/org/elasticsearch/index/analysis/ukrainian_lemmatizer/UkrainianLemmatizerTokenFilter.java +++ b/src/main/java/org/elasticsearch/index/analysis/ukrainian_lemmatizer/UkrainianLemmatizerTokenFilter.java @@ -8,10 +8,13 @@ import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.elasticsearch.common.logging.ESLogger; +import org.elasticsearch.common.logging.ESLoggerFactory; import org.sotnya.lemmatizer.uk.engine.UkrainianLemmatizer; public class UkrainianLemmatizerTokenFilter extends TokenFilter { private UkrainianLemmatizer lemmatizer = null; + private final ESLogger logger = ESLoggerFactory.getLogger(this.getClass().getSimpleName()); private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); @@ -30,12 +33,16 @@ public final boolean incrementToken() throws IOException { return false; } + logger.debug(String.format("Looking for term %s.", termAtt)); + Optional lemma = lemmatizer.lemmatize(termAtt); if (lemma.isPresent()) { if(!keywordAttr.isKeyword() && !equalCharSequences(lemma.get(), termAtt)) { termAtt.setEmpty().append(lemma.get()); } + + logger.debug(String.format("Found lemma %s", lemma.get())); } return true; @@ -46,9 +53,8 @@ public final boolean incrementToken() throws IOException { */ private boolean equalCharSequences(CharSequence s1, CharSequence s2) { int len1 = s1.length(); - int len2 = s2.length(); - if (len1 != len2) return false; + if (len1 != s2.length()) return false; for (int i = len1; --i >= 0; ) { if (s1.charAt(i) != s2.charAt(i)) { diff --git a/src/main/java/org/sotnya/lemmatizer/uk/engine/UkrainianLemmatizer.java b/src/main/java/org/sotnya/lemmatizer/uk/engine/UkrainianLemmatizer.java index c57b09f..add1a00 100644 --- a/src/main/java/org/sotnya/lemmatizer/uk/engine/UkrainianLemmatizer.java +++ b/src/main/java/org/sotnya/lemmatizer/uk/engine/UkrainianLemmatizer.java @@ -1,6 +1,8 @@ package org.sotnya.lemmatizer.uk.engine; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.elasticsearch.common.logging.ESLogger; +import org.elasticsearch.common.logging.ESLoggerFactory; import java.io.*; import java.util.HashMap; @@ -13,6 +15,7 @@ * Handles mapping retrieval, terms normalisation and lookup for proper lemmas in mapping. */ public class UkrainianLemmatizer { + private static final ESLogger logger = ESLoggerFactory.getLogger(UkrainianLemmatizer.class.getSimpleName()); private static final Map dictionary; /** * Before the lookup we replace some symbols to their alternatives are being used in mapping. @@ -25,10 +28,13 @@ public class UkrainianLemmatizer { static { // load mapping from file (must be changed to faster and memory-efficient type) - final InputStream is = UkrainianLemmatizer.class.getClassLoader().getResourceAsStream("mapping_sorted.csv"); + final String fileName = "mapping_sorted.csv"; final String separator = ","; + final ClassLoader loader = UkrainianLemmatizer.class.getClassLoader(); - try (BufferedReader reader = new BufferedReader(new InputStreamReader(is))) { + logger.debug("Started loading dictionary"); + + try (BufferedReader reader = new BufferedReader(new InputStreamReader(loader.getResourceAsStream(fileName), "UTF8"))) { dictionary = reader.lines() .map(line -> line.split(separator)) .collect(Collectors.toMap(p -> p[0], p -> p[1])); @@ -36,6 +42,7 @@ public class UkrainianLemmatizer { throw new UncheckedIOException(e); } + logger.debug(String.format("Finished loading dictionary. %d entities delivered gently.", dictionary.size())); // Let's wait some additional second but we'll keep our heap clean from // lots of short-lived objects created during the loading. System.gc();