Skip to content

Commit

Permalink
Added NEAMT-based annotators.
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelRoeder committed Aug 23, 2024
1 parent f8cc10c commit a63a8e3
Show file tree
Hide file tree
Showing 4 changed files with 264 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
package org.aksw.gerbil.annotator.impl.neamt;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.aksw.gerbil.annotator.http.AbstractHttpBasedAnnotator;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.aksw.gerbil.transfer.nif.data.SpanImpl;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Abstract annotator class for annotation systems that are hosted by <a href=
 * "https://github.com/dice-group/LFQA/tree/main/naive-eamt#na%C3%AFve-eamt-na%C3%AFve-entity-aware-machine-translation-framework">NEAMT</a>.
 * <p>
 * The text of a document is sent as a JSON object via HTTP POST to the
 * configured service URL. The {@code ent_mentions} array of the JSON response
 * is transformed into markings of a newly created result document.
 *
 * @author Michael R&ouml;der (michael.roeder@uni-paderborn.de)
 *
 */
public abstract class AbstractNeamtAnnotator extends AbstractHttpBasedAnnotator {

    private static final Logger LOGGER = LoggerFactory.getLogger(AbstractNeamtAnnotator.class);

    private static final String MEDIA_TYPE_STRING = ContentType.create("application/json", StandardCharsets.UTF_8)
            .toString();

    /**
     * Namespace that is prepended to the entity IDs returned in the {@code link}
     * field of a mention.
     */
    private static final String WIKIDATA_ENTITY_NAMESPACE = "http://www.wikidata.org/entity/";

    /**
     * Service URL.
     */
    protected String serviceUrl;
    /**
     * Component name as defined in the NEAMT service documentation.
     */
    protected String components;
    /**
     * Language tag.
     */
    protected String lang;

    /**
     * Constructor.
     *
     * @param serviceUrl the URL to which the requests are sent
     * @param components the value of the {@code components} field of the request
     * @param lang       the value of the {@code lang} field of the request
     */
    public AbstractNeamtAnnotator(String serviceUrl, String components, String lang) {
        super();
        this.serviceUrl = serviceUrl;
        this.components = components;
        this.lang = lang;
    }

    /**
     * Sends the given document to the service and returns a new document with the
     * same text and URI that contains the markings parsed from the response.
     *
     * @param document the document that should be annotated
     * @return a new document containing the markings found in the response
     * @throws GerbilException if the request cannot be created or the response
     *                         cannot be read or parsed; exceptions thrown while
     *                         sending the request are passed through
     */
    protected Document request(Document document) throws GerbilException {
        String text = document.getText();
        String documentUri = document.getDocumentURI();
        LOGGER.info("Started request for {}", documentUri);
        HttpPost request = null;
        try {
            request = createPostRequest(serviceUrl);
        } catch (Exception e) {
            throw new GerbilException("Couldn't create HTTP request.", e, ErrorTypes.UNEXPECTED_EXCEPTION);
        }

        JsonObject requestBody = createRequestBody(document);
        request.setEntity(new StringEntity(requestBody.toString(), StandardCharsets.UTF_8));

        request.addHeader(HttpHeaders.CONTENT_TYPE, MEDIA_TYPE_STRING);
        request.addHeader(HttpHeaders.ACCEPT, MEDIA_TYPE_STRING);

        HttpEntity entity = null;
        CloseableHttpResponse response = null;
        Document resultDoc = null;
        try {
            response = sendRequest(request);
            entity = response.getEntity();
            try {
                resultDoc = new DocumentImpl(text, documentUri);
                // Decode with the charset declared by the response and fall back to UTF-8
                // instead of relying on the platform default charset.
                String content = (entity == null) ? null : EntityUtils.toString(entity, StandardCharsets.UTF_8);
                if (content == null) {
                    throw new IOException("The response has no body.");
                }
                JsonObject outJson = new JsonParser().parse(content).getAsJsonObject();
                parseMarkings(outJson, resultDoc);
            } catch (Exception e) {
                LOGGER.error("Couldn't parse the response.", e);
                throw new GerbilException("Couldn't parse the response.", e, ErrorTypes.UNEXPECTED_EXCEPTION);
            }
        } finally {
            closeRequest(request);
            if (entity != null) {
                try {
                    EntityUtils.consume(entity);
                } catch (IOException ignored) {
                    // Best-effort cleanup; a failure here must not hide the actual result.
                    LOGGER.debug("Couldn't consume the response entity.", ignored);
                }
            }
            IOUtils.closeQuietly(response);
        }
        LOGGER.info("Finished request for {}", resultDoc.getDocumentURI());
        return resultDoc;
    }

    /**
     * Creates the JSON object that is sent as request body. It comprises the
     * document text ({@code query}), the configured {@code components} and
     * {@code lang} values and the {@code full_json} flag.
     *
     * @param document the document that should be annotated
     * @return the JSON object used as body of the request
     */
    protected JsonObject createRequestBody(Document document) {
        JsonObject requestBody = new JsonObject();
        requestBody.addProperty("query", document.getText());
        requestBody.addProperty("components", components);
        requestBody.addProperty("full_json", true);
        requestBody.addProperty("lang", lang);
        return requestBody;
    }

    /**
     * Parses all elements of the {@code ent_mentions} array of the given response
     * and adds the resulting markings to the given document. A warning is logged
     * if the response has no such array.
     *
     * @param outJson   the JSON object returned by the service
     * @param resultDoc the document to which the markings are added
     */
    protected void parseMarkings(JsonObject outJson, Document resultDoc) {
        JsonElement element = outJson.get("ent_mentions");
        if ((element != null) && element.isJsonArray()) {
            for (JsonElement mention : element.getAsJsonArray()) {
                parseMarking(mention, resultDoc);
            }
            return;
        }
        LOGGER.warn("Couldn't find any mentions in the result \"{}\". It will be ignored.", outJson.toString());
    }

    /**
     * Parses a single mention and adds it to the given document. A mention with a
     * non-empty {@code link} becomes a {@link NamedEntity}, a mention without a
     * usable link becomes a {@link SpanImpl}. A mention that is not a JSON object
     * or that has no valid {@code start} and {@code end} values is ignored and a
     * warning is logged.
     *
     * @param mentionElement the JSON element representing the mention
     * @param resultDoc      the document to which the marking is added
     */
    protected void parseMarking(JsonElement mentionElement, Document resultDoc) {
        if (mentionElement.isJsonObject()) {
            JsonObject mentionObj = mentionElement.getAsJsonObject();
            // The marking should have start and end
            Integer start = readPosition(mentionObj, "start");
            Integer end = readPosition(mentionObj, "end");
            // Negative or inverted positions can not be expressed as a marking
            if ((start != null) && (end != null) && (start >= 0) && (end >= start)) {
                int startPos = start;
                int length = end - start;
                // If we have found no IRI, we have a Span, otherwise a NamedEntity
                String iri = readIri(mentionObj);
                if (iri == null) {
                    resultDoc.addMarking(new SpanImpl(startPos, length));
                } else {
                    resultDoc.addMarking(new NamedEntity(startPos, length, iri));
                }
                return; // We can return without problems
            }
        }
        // Something went wrong
        LOGGER.warn("Couldn't parse mention \"{}\". It will be ignored.", mentionElement.toString());
    }

    /**
     * Reads the integer stored under the given key or returns {@code null} if the
     * key is missing or its value is not an integer.
     */
    private static Integer readPosition(JsonObject mentionObj, String key) {
        JsonElement value = mentionObj.get(key);
        if ((value == null) || !value.isJsonPrimitive()) {
            return null;
        }
        try {
            return value.getAsInt();
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Reads the {@code link} of the given mention and returns it as IRI or returns
     * {@code null} if the link is missing, a JSON null, not a primitive or empty.
     */
    private static String readIri(JsonObject mentionObj) {
        JsonElement link = mentionObj.get("link");
        if ((link == null) || !link.isJsonPrimitive()) {
            return null;
        }
        String id = link.getAsString();
        // It is just the Wikidata ID, so we have to add the namespace
        return id.isEmpty() ? null : WIKIDATA_ENTITY_NAMESPACE + id;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package org.aksw.gerbil.annotator.impl.neamt;

import java.util.List;

import org.aksw.gerbil.annotator.D2KBAnnotator;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.MeaningSpan;
import org.aksw.gerbil.transfer.nif.Span;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;

/**
 * NEAMT-based annotator for the D2KB task. The spans that are already marked in
 * the given document are sent to the service as {@code ent_mentions}.
 */
public class NeamtD2KBAnnotator extends AbstractNeamtAnnotator implements D2KBAnnotator {

    public NeamtD2KBAnnotator(String serviceUrl, String components, String lang) {
        super(serviceUrl, components, lang);
    }

    /**
     * Sends the document to the service and returns the markings of the result
     * that are {@link MeaningSpan} instances.
     */
    @Override
    public List<MeaningSpan> performD2KBTask(Document document) throws GerbilException {
        Document annotated = request(document);
        return annotated.getMarkings(MeaningSpan.class);
    }

    /**
     * Extends the request body of the super class with an {@code ent_mentions}
     * array containing one object per span of the given document.
     */
    @Override
    protected JsonObject createRequestBody(Document document) {
        String documentText = document.getText();
        JsonObject body = super.createRequestBody(document);
        JsonArray givenMentions = new JsonArray();
        document.getMarkings(Span.class).forEach(s -> givenMentions.add(toMention(s, documentText)));
        body.add("ent_mentions", givenMentions);
        return body;
    }

    /**
     * Creates the JSON representation of a single span comprising its start and
     * end positions and the part of the text that it covers.
     */
    private static JsonObject toMention(Span span, String documentText) {
        int from = span.getStartPosition();
        int to = from + span.getLength();
        JsonObject json = new JsonObject();
        json.addProperty("start", from);
        json.addProperty("end", to);
        json.addProperty("surface_form", documentText.substring(from, to));
        return json;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package org.aksw.gerbil.annotator.impl.neamt;

import java.util.List;

import org.aksw.gerbil.annotator.EntityRecognizer;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Span;

/**
 * NEAMT-based annotator for the entity recognition task.
 */
public class NeamtEntityRecognizer extends AbstractNeamtAnnotator implements EntityRecognizer {

    public NeamtEntityRecognizer(String serviceUrl, String components, String lang) {
        super(serviceUrl, components, lang);
    }

    /**
     * Sends the document to the service and returns the markings of the result
     * that are {@link Span} instances.
     */
    @Override
    public List<Span> performRecognition(Document document) throws GerbilException {
        Document annotated = request(document);
        return annotated.getMarkings(Span.class);
    }

}
38 changes: 37 additions & 1 deletion src/main/properties/annotators.properties
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,20 @@ org.aksw.gerbil.annotators.definition.cetus2.cacheable=true
org.aksw.gerbil.annotators.definition.cetus2.class=org.aksw.gerbil.annotator.impl.nif.NIFBasedAnnotatorWebservice
org.aksw.gerbil.annotators.definition.cetus2.constructorArgs=${org.aksw.gerbil.annotator.cetus2.ServieURL}

### Davlan
# NEAMT-based annotator definition for the ERec experiment type.
# constructorArgs: NEAMT service URL, component name (davlan_ner), language tag (en),
# i.e. the (serviceUrl, components, lang) parameters of the NeamtEntityRecognizer constructor.
org.aksw.gerbil.annotators.definition.Davlan.name=Davlan (NEAMT)
org.aksw.gerbil.annotators.definition.Davlan.experimentType=ERec
org.aksw.gerbil.annotators.definition.Davlan.cacheable=true
org.aksw.gerbil.annotators.definition.Davlan.class=org.aksw.gerbil.annotator.impl.neamt.NeamtEntityRecognizer
org.aksw.gerbil.annotators.definition.Davlan.constructorArgs=${org.aksw.gerbil.annotators.NEAMT.url}, davlan_ner, en

### DBpedia Spotlight
org.aksw.gerbil.annotators.definition.spotlight.name=DBpedia Spotlight
org.aksw.gerbil.annotators.definition.spotlight.experimentType=OKE_Task1
org.aksw.gerbil.annotators.definition.spotlight.cacheable=true
org.aksw.gerbil.annotators.definition.spotlight.class=org.aksw.gerbil.annotator.impl.spotlight.SpotlightAnnotator
org.aksw.gerbil.annotator.impl.spotlight.SpotlightAnnotator.ServieURL=https://api.dbpedia-spotlight.org/en/


### Dexter
org.aksw.gerbil.annotators.DexterAnnotator.annotationUrl=http://dexterdemo.isti.cnr.it:8080/dexter-webapp/api/nif/annotate
org.aksw.gerbil.annotators.definition.Dexter.name=Dexter
Expand Down Expand Up @@ -94,6 +100,13 @@ org.aksw.gerbil.annotators.definition.FALCON.cacheable=true
org.aksw.gerbil.annotators.definition.FALCON.class=org.aksw.gerbil.annotator.impl.falcon.FALCONAnnotator
org.aksw.gerbil.annotators.definition.FALCON.constructorArgs=${org.aksw.gerbil.annotator.FALCON.ServieURL}

### FLAIR
# NEAMT-based annotator definition for the ERec experiment type.
# constructorArgs: NEAMT service URL, component name (flair_ner), language tag (en),
# i.e. the (serviceUrl, components, lang) parameters of the NeamtEntityRecognizer constructor.
org.aksw.gerbil.annotators.definition.FLAIR.name=Flair (NEAMT)
org.aksw.gerbil.annotators.definition.FLAIR.experimentType=ERec
org.aksw.gerbil.annotators.definition.FLAIR.cacheable=true
org.aksw.gerbil.annotators.definition.FLAIR.class=org.aksw.gerbil.annotator.impl.neamt.NeamtEntityRecognizer
org.aksw.gerbil.annotators.definition.FLAIR.constructorArgs=${org.aksw.gerbil.annotators.NEAMT.url}, flair_ner, en

### FRED
org.aksw.gerbil.annotators.FredAnnotator.serviceUrl=http://wit.istc.cnr.it/stlab-tools/fred
org.aksw.gerbil.annotators.definition.fred.name=FRED
Expand Down Expand Up @@ -168,6 +181,15 @@ org.aksw.gerbil.annotators.definition.kea2.check.args=org.aksw.gerbil.annotators
org.aksw.gerbil.annotators.definition.kea2.check.args=org.aksw.gerbil.annotators.KeaAnnotatorConfig.password
org.aksw.gerbil.annotators.definition.kea2.constructorArgs=http://${org.aksw.gerbil.annotators.KeaAnnotatorConfig.user}:${org.aksw.gerbil.annotators.KeaAnnotatorConfig.password}@${org.aksw.gerbil.annotators.KeaAnnotatorConfig.disambiguationUrl}

### mGENRE
# NEAMT-based annotator definition for the D2KB experiment type.
# constructorArgs: NEAMT service URL, component name (mgenre_el), language tag (en),
# i.e. the (serviceUrl, components, lang) parameters of the NeamtD2KBAnnotator constructor.
org.aksw.gerbil.annotators.definition.mGENRE.name=mGENRE (NEAMT)
org.aksw.gerbil.annotators.definition.mGENRE.experimentType=D2KB
org.aksw.gerbil.annotators.definition.mGENRE.cacheable=true
org.aksw.gerbil.annotators.definition.mGENRE.class=org.aksw.gerbil.annotator.impl.neamt.NeamtD2KBAnnotator
org.aksw.gerbil.annotators.definition.mGENRE.constructorArgs=${org.aksw.gerbil.annotators.NEAMT.url}, mgenre_el, en

### NEAMT
# URL of the NEAMT service. It is referenced as the first constructor argument
# of the annotator definitions in this file that are marked with "(NEAMT)".
org.aksw.gerbil.annotators.NEAMT.url=http://porque.cs.upb.de/porque-neamt/custom-pipeline

### NERD-ML
#NERD endpoint
Expand Down Expand Up @@ -214,6 +236,13 @@ org.aksw.gerbil.annotators.definition.REL.cacheable=true
org.aksw.gerbil.annotators.definition.REL.class=org.aksw.gerbil.annotator.impl.rel.RELAnnotator
org.aksw.gerbil.annotators.definition.REL.constructorArgs=${org.aksw.gerbil.annotator.REL.ServieURL}

### Spacy
# NEAMT-based annotator definition for the ERec experiment type.
# constructorArgs: NEAMT service URL, component name (spacy_ner), language tag (en),
# i.e. the (serviceUrl, components, lang) parameters of the NeamtEntityRecognizer constructor.
org.aksw.gerbil.annotators.definition.Spacy.name=Spacy (NEAMT)
org.aksw.gerbil.annotators.definition.Spacy.experimentType=ERec
org.aksw.gerbil.annotators.definition.Spacy.cacheable=true
org.aksw.gerbil.annotators.definition.Spacy.class=org.aksw.gerbil.annotator.impl.neamt.NeamtEntityRecognizer
org.aksw.gerbil.annotators.definition.Spacy.constructorArgs=${org.aksw.gerbil.annotators.NEAMT.url}, spacy_ner, en

### Tagme
org.aksw.gerbil.annotators.TagmeAnnotator.annotateUrl=https://tagme.d4science.org/tagme/tag
org.aksw.gerbil.annotators.TagmeAnnotator.spotUrl=https://tagme.d4science.org/tagme/spot
Expand Down Expand Up @@ -266,3 +295,10 @@ org.aksw.gerbil.annotators.definition.XLisa2.kb=dbpedia
org.aksw.gerbil.annotators.definition.XLisa2.model=NER
org.aksw.gerbil.annotators.definition.XLisa2.constructorArgs=${org.aksw.gerbil.annotators.definition.XLisa2.lang1}, ${org.aksw.gerbil.annotators.definition.XLisa2.lang2}, ${org.aksw.gerbil.annotators.definition.XLisa2.kb}, ${org.aksw.gerbil.annotators.definition.XLisa2.model}

### WikiNEuRal
# NEAMT-based annotator definition for the ERec experiment type.
# constructorArgs: NEAMT service URL, component name (babelscape_ner), language tag (en),
# i.e. the (serviceUrl, components, lang) parameters of the NeamtEntityRecognizer constructor.
org.aksw.gerbil.annotators.definition.WikiNEuRal.name=WikiNEuRal (NEAMT)
org.aksw.gerbil.annotators.definition.WikiNEuRal.experimentType=ERec
org.aksw.gerbil.annotators.definition.WikiNEuRal.cacheable=true
org.aksw.gerbil.annotators.definition.WikiNEuRal.class=org.aksw.gerbil.annotator.impl.neamt.NeamtEntityRecognizer
org.aksw.gerbil.annotators.definition.WikiNEuRal.constructorArgs=${org.aksw.gerbil.annotators.NEAMT.url}, babelscape_ner, en

0 comments on commit a63a8e3

Please sign in to comment.