Skip to content

Commit

Permalink
New SDK changes applied to the project [ElasticSearch 1.7.5]
Browse files Browse the repository at this point in the history
FIX: added UTF-8 specification to the reader to prevent failures on non-Unicode systems;
CHG: moved onto ES SDK v1.7.5;
ADD: added some little logging;
  • Loading branch information
mrgambal committed Feb 17, 2016
1 parent 896a206 commit d40e240
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 24 deletions.
48 changes: 28 additions & 20 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@ apply plugin: 'groovy'
apply plugin: 'maven'

group = 'org.sotnya'
version = '1.0.1'
version = '1.0.2'
sourceCompatibility = "1.8"
targetCompatibility = "1.8"


description = """Ukrainian lemmatizer plugin for ElasticSearch"""

repositories {
Expand All @@ -19,14 +18,14 @@ repositories {
}

dependencies {
compile 'org.elasticsearch:elasticsearch:1.7.4'
compile 'junit:junit:4.12'

testCompile 'org.apache.lucene:lucene-test-framework:4.10.4'
testCompile 'org.elasticsearch:elasticsearch:1.7.4:tests'
testCompile 'com.google.guava:guava:18.0'
testCompile 'com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.1.14'
testCompile 'org.hamcrest:hamcrest-all:1.3'
compile group: 'org.elasticsearch', name: 'elasticsearch', version:'1.7.5'

testCompile group: 'junit', name: 'junit', version: '4.12'
testCompile group: 'org.apache.lucene', name: 'lucene-test-framework', version: '4.10.4'
testCompile group: 'org.elasticsearch', name: 'elasticsearch', version: '1.7.5', classifier: 'tests'
testCompile group: 'com.google.guava', name: 'guava', version: '19.0'
testCompile group: 'com.carrotsearch.randomizedtesting', name: 'randomizedtesting-runner', version: '2.1.14'
testCompile group: 'org.hamcrest', name: 'hamcrest-library', version: '1.3'
}

File explodedDistDir = new File(distsDir, 'exploded')
Expand All @@ -41,21 +40,30 @@ task explodedDist(dependsOn: [jar], description: 'Builds the plugin zip file') <
into explodedDistDir
}

ant.delete {
fileset(dir: explodedDistDir, includes: "elasticsearch-*.jar")
}
ant.delete { fileset(dir: explodedDistDir, includes: "elasticsearch-*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "asm*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "antlr*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "compress*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "commons-cli*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "elasticsearch-*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "guava*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "hamcrest*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "HdrHistogram*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "hppc*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "jackson*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "joda*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "jsr*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "junit*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "lucene-*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "netty-*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "snakeyaml*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "spatial4j-*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "t-digest*.jar") }

copy {
from libsDir
into explodedDistDir
}

ant.delete { fileset(dir: explodedDistDir, includes: "lucene-*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "spatial4j-*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "hamcrest*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "junit*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "asm*.jar") }
ant.delete { fileset(dir: explodedDistDir, includes: "antlr*.jar") }
}

task zip(type: Zip, dependsOn: ['explodedDist']) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.ESLoggerFactory;
import org.sotnya.lemmatizer.uk.engine.UkrainianLemmatizer;

public class UkrainianLemmatizerTokenFilter extends TokenFilter {
private UkrainianLemmatizer lemmatizer = null;
private final ESLogger logger = ESLoggerFactory.getLogger(this.getClass().getSimpleName());
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

Expand All @@ -30,12 +33,16 @@ public final boolean incrementToken() throws IOException {
return false;
}

logger.debug(String.format("Looking for term %s.", termAtt));

Optional<CharSequence> lemma = lemmatizer.lemmatize(termAtt);

if (lemma.isPresent()) {
if(!keywordAttr.isKeyword() && !equalCharSequences(lemma.get(), termAtt)) {
termAtt.setEmpty().append(lemma.get());
}

logger.debug(String.format("Found lemma %s", lemma.get()));
}

return true;
Expand All @@ -46,9 +53,8 @@ public final boolean incrementToken() throws IOException {
*/
private boolean equalCharSequences(CharSequence s1, CharSequence s2) {
int len1 = s1.length();
int len2 = s2.length();

if (len1 != len2) return false;
if (len1 != s2.length()) return false;

for (int i = len1; --i >= 0; ) {
if (s1.charAt(i) != s2.charAt(i)) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.sotnya.lemmatizer.uk.engine;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.ESLoggerFactory;

import java.io.*;
import java.util.HashMap;
Expand All @@ -13,6 +15,7 @@
* Handles mapping retrieval, terms normalisation and lookup for proper lemmas in mapping.
*/
public class UkrainianLemmatizer {
private static final ESLogger logger = ESLoggerFactory.getLogger(UkrainianLemmatizer.class.getSimpleName());
private static final Map<String, String> dictionary;
/**
* Before the lookup we replace some symbols to their alternatives are being used in mapping.
Expand All @@ -25,17 +28,21 @@ public class UkrainianLemmatizer {

static {
// load mapping from file (must be changed to faster and memory-efficient type)
final InputStream is = UkrainianLemmatizer.class.getClassLoader().getResourceAsStream("mapping_sorted.csv");
final String fileName = "mapping_sorted.csv";
final String separator = ",";
final ClassLoader loader = UkrainianLemmatizer.class.getClassLoader();

try (BufferedReader reader = new BufferedReader(new InputStreamReader(is))) {
logger.debug("Started loading dictionary");

try (BufferedReader reader = new BufferedReader(new InputStreamReader(loader.getResourceAsStream(fileName), "UTF8"))) {
dictionary = reader.lines()
.map(line -> line.split(separator))
.collect(Collectors.toMap(p -> p[0], p -> p[1]));
} catch (IOException e) {
throw new UncheckedIOException(e);
}

logger.debug(String.format("Finished loading dictionary. %d entities delivered gently.", dictionary.size()));
// Let's wait some additional second but we'll keep our heap clean from
// lots of short-lived objects created during the loading.
System.gc();
Expand Down

0 comments on commit d40e240

Please sign in to comment.