diff --git a/Data/src/main/java/org/tribuo/data/text/FeatureTransformer.java b/Data/src/main/java/org/tribuo/data/text/FeatureTransformer.java
index db0e3d11f..ff23862cd 100644
--- a/Data/src/main/java/org/tribuo/data/text/FeatureTransformer.java
+++ b/Data/src/main/java/org/tribuo/data/text/FeatureTransformer.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2022, Oracle and/or its affiliates. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -24,8 +24,11 @@
import java.util.List;
/**
- * A feature transformer maps a list of features to a new list of features
- * Useful for example to apply the hashing trick to a set of features
+ * A feature transformer maps a list of features to a new list of features.
+ * Useful for example to apply the hashing trick to a set of features.
+ *
+ * Note a list of features returned by a {@code FeatureTransformer} may contain
+ * duplicate features, and should be reduced to ensure that each feature is unique.
*/
public interface FeatureTransformer extends Configurable, Provenancable {
diff --git a/Data/src/main/java/org/tribuo/data/text/impl/FeatureHasher.java b/Data/src/main/java/org/tribuo/data/text/impl/FeatureHasher.java
index 2450e83a8..ecb956f83 100644
--- a/Data/src/main/java/org/tribuo/data/text/impl/FeatureHasher.java
+++ b/Data/src/main/java/org/tribuo/data/text/impl/FeatureHasher.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2022, Oracle and/or its affiliates. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -36,15 +36,62 @@ public class FeatureHasher implements FeatureTransformer {
private static final Logger logger = Logger.getLogger(FeatureHasher.class.getName());
+ /**
+ * Default value for the hash function seed.
+ */
+ public static final int DEFAULT_HASH_SEED = 38495;
+
+ /**
+ * Default value for the value hash function seed.
+ */
+ public static final int DEFAULT_HASH_VALUE_SEED = 77777;
+
@Config(mandatory = true,description="Dimension to map the hash into.")
private int dimension;
+ @Config(description = "Seed used in the hash function.")
+ private int hashSeed = DEFAULT_HASH_SEED;
+
+ @Config(description = "Seed used for value hash function.")
+ private int hashValueSeed = DEFAULT_HASH_VALUE_SEED;
+
+ @Config(description = "Preserve input feature value.")
+ private boolean preserveValue = false;
+
/**
* Constructs a feature hasher using the supplied hash dimension.
+ *
+ * Note this hasher discards the input feature value and emits a value in {-1, 1} derived from hashing the feature name.
* @param dimension The dimension to reduce the hashed features into.
*/
public FeatureHasher(int dimension) {
+ this(dimension, DEFAULT_HASH_SEED, DEFAULT_HASH_VALUE_SEED, false);
+ }
+
+ /**
+ * Constructs a feature hasher using the supplied hash dimension.
+ * @param dimension The dimension to reduce the hashed features into.
+ * @param preserveValue If true the feature value is used unaltered in the new features,
+ * if false it is hashed into the values {-1, 1}.
+ */
+ public FeatureHasher(int dimension, boolean preserveValue) {
+ this(dimension, DEFAULT_HASH_SEED, DEFAULT_HASH_VALUE_SEED, preserveValue);
+ }
+
+ /**
+ * Constructs a feature hasher using the supplied hash dimension and seed values.
+ * @param dimension The dimension to reduce the hashed features into.
+ * @param hashSeed The seed used in the murmurhash computation.
+ * @param hashValueSeed The seed used in the murmurhash computation for the feature value,
+ * unused if {@code preserveValue} is true.
+ * @param preserveValue If true the feature value is used unaltered in the new features,
+ * if false it is hashed into the values {-1, 1}.
+ */
+ public FeatureHasher(int dimension, int hashSeed, int hashValueSeed, boolean preserveValue) {
this.dimension = dimension;
+ this.hashSeed = hashSeed;
+ this.hashValueSeed = hashValueSeed;
+ this.preserveValue = preserveValue;
}
/**
@@ -58,20 +105,23 @@ public List map(String tag, List features) {
List hashedFeatures = new ArrayList<>();
for (Feature feature : features) {
- int hash = MurmurHash3.murmurhash3_x86_32(feature.getName(), 0, feature.getName().length(), 38495);
- //int bit = hash & 1;
- int bit = MurmurHash3.murmurhash3_x86_32(feature.getName(), 0, feature.getName().length(), 77777) & 1;
+ int hash = MurmurHash3.murmurhash3_x86_32(feature.getName(), 0, feature.getName().length(), hashSeed);
hash = hash >>> 1;
int code = hash % dimension;
-
- int change = bit == 1 ? 1 : -1;
- Feature newFeature = new Feature(tag + "-hash="+code,change);
+ double value;
+ if (preserveValue) {
+ value = feature.getValue();
+ } else {
+ int bit = MurmurHash3.murmurhash3_x86_32(feature.getName(), 0, feature.getName().length(), hashValueSeed) & 1;
+ value = bit == 1 ? 1 : -1;
+ }
+
+ Feature newFeature = new Feature(tag + "-hash="+code, value);
hashedFeatures.add(newFeature);
}
return hashedFeatures;
-
}
@Override
diff --git a/Data/src/main/java/org/tribuo/data/text/impl/TokenPipeline.java b/Data/src/main/java/org/tribuo/data/text/impl/TokenPipeline.java
index 6cdbb180c..663851c17 100644
--- a/Data/src/main/java/org/tribuo/data/text/impl/TokenPipeline.java
+++ b/Data/src/main/java/org/tribuo/data/text/impl/TokenPipeline.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2022, Oracle and/or its affiliates. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -43,16 +43,19 @@ public class TokenPipeline implements TextPipeline {
private List transformers = new ArrayList<>();
private FeatureAggregator aggregator;
- @Config(mandatory = true,description="Use term counting, otherwise emit binary features.")
+ @Config(mandatory = true, description = "Use term counting, otherwise emit binary features.")
private boolean termCounting;
- @Config(description="Dimension to map the hash into.")
+ @Config(description = "Dimension to map the hash into.")
private int hashDim = -1;
- @Config(mandatory = true,description="Tokenizer to use.")
+ @Config(description = "Should feature hashing preserve the value?")
+ private boolean hashPreserveValue = true;
+
+ @Config(mandatory = true, description = "Tokenizer to use.")
private Tokenizer tokenizer;
- @Config(description="n in the n-gram to emit.")
+ @Config(description = "n in the n-gram to emit.")
private int ngram = 2;
/**
@@ -73,7 +76,31 @@ public class TokenPipeline implements TextPipeline {
public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting) {
this(tokenizer, ngram, termCounting, -1);
}
-
+
+ /**
+ * Creates a new token pipeline.
+ *
+ * @param tokenizer The tokenizer to use to split up the text into words
+ * (i.e., features.)
+ * @param ngram The maximum size of ngram features to add to the features
+ * generated by the pipeline. A value of {@code n} means that ngram
+ * features of size 1-n will be generated. A good standard value to use is
+ * 2, which means that unigram and bigram features will be generated. You
+ * will very likely see diminishing returns for larger values of
+ * {@code n} but there will be times when they will be necessary.
+ * @param termCounting If {@code true}, multiple occurrences of terms
+ * in the document will be counted and the count will be used as the value
+ * of the features that are produced.
+ * @param dimension The maximum dimension for the feature space. If this value
+ * is greater than 0, then at most {@code dimension} features will be
+ * generated through the use of a hashing function that will collapse the feature
+ * space. This {@code TokenPipeline} will preserve the feature values when hashing,
+ * rather than hashing them into the values {-1, 1}.
+ */
+ public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting, int dimension) {
+ this(tokenizer, ngram, termCounting, dimension, true);
+ }
+
/**
* Creates a new token pipeline.
*
@@ -88,16 +115,19 @@ public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting) {
* @param termCounting If {@code true}, multiple occurrences of terms
* in the document will be counted and the count will be used as the value
* of the features that are produced.
- * @param dimension The maximum dimension for the feature space. If this value
+ * @param dimension The maximum dimension for the feature space. If this value
* is greater than 0, then at most {@code dimension} features will be
- * through the use of a hashing function that will collapse the feature
+ * through the use of a hashing function that will collapse the feature
* space.
+ * @param hashPreserveValue If true, the hash function preserves the feature value, if false
+ * it hashes it into the values {-1, 1}.
*/
- public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting, int dimension) {
+ public TokenPipeline(Tokenizer tokenizer, int ngram, boolean termCounting, int dimension, boolean hashPreserveValue) {
this.tokenizer = tokenizer;
this.ngram = ngram;
this.hashDim = dimension;
this.termCounting = termCounting;
+ this.hashPreserveValue = hashPreserveValue;
postConfig();
}
@@ -115,7 +145,7 @@ public void postConfig() {
processors.add(new NgramProcessor(tokenizer,i,1));
}
if (hashDim > 0) {
- transformers.add(new FeatureHasher(hashDim));
+ transformers.add(new FeatureHasher(hashDim, hashPreserveValue));
}
if (termCounting) {
aggregator = new SumAggregator();
diff --git a/Data/src/test/java/org/tribuo/data/text/TextPipelineTest.java b/Data/src/test/java/org/tribuo/data/text/TextPipelineTest.java
index 954204e23..c15a52df2 100644
--- a/Data/src/test/java/org/tribuo/data/text/TextPipelineTest.java
+++ b/Data/src/test/java/org/tribuo/data/text/TextPipelineTest.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2022, Oracle and/or its affiliates. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -101,6 +101,34 @@ public void testTokenPipeline() {
assertTrue(featureList.contains(new Feature("2-N=input/text",1.0)));
}
+ @Test
+ public void testHashingTokenPipeline() {
+ String input = "This is some input text.";
+
+ TokenPipeline pipeline = new TokenPipeline(new BreakIteratorTokenizer(Locale.US),2,true, 10);
+
+ List featureList = pipeline.process("test",input);
+
+ assertTrue(featureList.contains(new Feature("test-hash=1",1.0)));
+ assertTrue(featureList.contains(new Feature("test-hash=2",2.0)));
+ assertTrue(featureList.contains(new Feature("test-hash=3",5.0)));
+ assertTrue(featureList.contains(new Feature("test-hash=5",1.0)));
+ assertTrue(featureList.contains(new Feature("test-hash=6",1.0)));
+ assertTrue(featureList.contains(new Feature("test-hash=7",1.0)));
+
+ TokenPipeline hashedValuePipeline = new TokenPipeline(new BreakIteratorTokenizer(Locale.US),2,true, 10, false);
+
+ List hashedValueFeatureList = hashedValuePipeline.process("test",input);
+
+ assertTrue(hashedValueFeatureList.contains(new Feature("test-hash=1",1.0)));
+ assertTrue(hashedValueFeatureList.contains(new Feature("test-hash=2",0.0)));
+ assertTrue(hashedValueFeatureList.contains(new Feature("test-hash=3",-1.0)));
+ assertTrue(hashedValueFeatureList.contains(new Feature("test-hash=5",-1.0)));
+ assertTrue(hashedValueFeatureList.contains(new Feature("test-hash=6",1.0)));
+ assertTrue(hashedValueFeatureList.contains(new Feature("test-hash=7",-1.0)));
+ }
+
+
@Test
public void testTokenPipelineTagging() {
String input = "This is some input text.";
diff --git a/Data/src/test/java/org/tribuo/data/text/impl/FeatureHasherTest.java b/Data/src/test/java/org/tribuo/data/text/impl/FeatureHasherTest.java
new file mode 100644
index 000000000..78c92747e
--- /dev/null
+++ b/Data/src/test/java/org/tribuo/data/text/impl/FeatureHasherTest.java
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tribuo.data.text.impl;
+
+import org.junit.jupiter.api.Test;
+import org.tribuo.Feature;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class FeatureHasherTest {
+
+ @Test
+ public void negativeValuesTest() {
+ List input = new ArrayList<>();
+ Feature posValue = new Feature("Testing", 2.0);
+ input.add(posValue);
+ Feature negValue = new Feature("Test",2.0);
+ input.add(negValue);
+
+ FeatureHasher preserving = new FeatureHasher(10, true);
+ FeatureHasher notPreserving = new FeatureHasher(10, false);
+
+ List preservingOutput = preserving.map("test", input);
+ List notPreservingOutput = notPreserving.map("test", input);
+
+ assertEquals(2.0, preservingOutput.get(0).getValue());
+ assertEquals(2.0, preservingOutput.get(1).getValue());
+
+ assertEquals(1.0, notPreservingOutput.get(0).getValue());
+ assertEquals(-1.0, notPreservingOutput.get(1).getValue());
+ }
+
+}