Skip to content

Commit

Permalink
Skip intermediate merging for HNSW indexing when optimizing (#2272)
Browse files Browse the repository at this point in the history
+ If we're going to optimize down to a single segment at the end, intermediate merging is a waste.
+ Bumped up the default memory buffer size to 64 GB.
  • Loading branch information
lintool authored Nov 25, 2023
1 parent 227d93a commit 95a546c
Showing 1 changed file with 14 additions and 3 deletions.
17 changes: 14 additions & 3 deletions src/main/java/io/anserini/index/IndexHnswDenseVectors.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.kohsuke.args4j.CmdLineException;
Expand Down Expand Up @@ -82,7 +83,7 @@ public static final class Args {
public boolean optimize = false;

@Option(name = "-memorybuffer", metaVar = "[mb]", usage = "Memory buffer size in MB.")
public int memorybufferSize = 4096;
public int memorybufferSize = 65536;

@Option(name = "-storeVectors", usage = "Boolean switch to store raw raw vectors.")
public boolean storeVectors = false;
Expand All @@ -95,7 +96,6 @@ public static final class Args {

@Option(name = "-quiet", forbids = {"-verbose"}, usage = "Turns off all logging.")
public boolean quiet = false;

}

private final class LocalIndexerThread extends Thread {
Expand Down Expand Up @@ -247,7 +247,7 @@ public IndexHnswDenseVectors(Args args) throws Exception {

// Solution provided by Solr, see https://www.mail-archive.com/java-user@lucene.apache.org/msg52149.html
// This class exists because Lucene95HnswVectorsFormat's getMaxDimensions method is final and we
// need to work around that constraint to allow more than the default number of dimensions
// need to work around that constraint to allow more than the default number of dimensions.
private static final class OpenAiDelegatingKnnVectorsFormat extends KnnVectorsFormat {
private final KnnVectorsFormat delegate;
private final int maxDimensions;
Expand Down Expand Up @@ -287,10 +287,21 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
new Lucene95HnswVectorsFormat(args.M, args.efC), 4096);
}
});

config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
config.setRAMBufferSizeMB(args.memorybufferSize);
config.setUseCompoundFile(false);
config.setMergeScheduler(new ConcurrentMergeScheduler());

if (args.optimize) {
// If we're going to merge down into a single segment at the end, skip intermediate merges,
// since they are a waste of time.
TieredMergePolicy mergePolicy = new TieredMergePolicy();
mergePolicy.setMaxMergeAtOnce(256);
mergePolicy.setSegmentsPerTier(256);
config.setMergePolicy(mergePolicy);
}

IndexWriter writer = new IndexWriter(dir, config);

final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(args.threads);
Expand Down

0 comments on commit 95a546c

Please sign in to comment.