From 0f434832369668cef2fee80489f707c965d6ab2d Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Mon, 30 Sep 2024 13:43:56 -0700
Subject: [PATCH 1/6] demo: Implement manual boundary locking for clusters

While automatic border locking usually works very well, it prevents
simplifying open geometric borders that the source mesh might contain.
When a mesh has many such edges, this negatively affects DAG quality by
locking edges contained within a single cluster group.

Instead, we can simply mark vertices that are shared by more than one
group after a partition and feed that data to the simplifier.

The effect of this is situational; it's beneficial overall, but because
the rest of the pipeline has odd issues with partitioning (even when
using METIS for everything), it sometimes ends up constraining the
build process a little more for unclear reasons. Theoretically this
should be strictly better, and on some meshes like kitten.obj it does
measurably help; however, it needs fixes for attribute seams before it
can stay enabled.
---
 demo/nanite.cpp | 41 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/demo/nanite.cpp b/demo/nanite.cpp
index a52a61a39..45e7ca694 100644
--- a/demo/nanite.cpp
+++ b/demo/nanite.cpp
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -50,6 +51,7 @@ struct Cluster
 };
 
 const size_t kClusterSize = 128;
+const bool kUseLocks = false;
 
 static LODBounds bounds(const std::vector<Vertex>& vertices, const std::vector<unsigned int>& indices, float error)
 {
@@ -386,15 +388,40 @@ static std::vector<std::vector<int> > partition(const std::vector<Cluster>& clus
 	return result;
 }
 
-static std::vector<unsigned int> simplify(const std::vector<Vertex>& vertices, const std::vector<unsigned int>& indices, size_t target_count, float* error = NULL)
+static void lockBoundary(std::vector<unsigned char>& locks, const std::vector<std::vector<int> >& groups, const std::vector<Cluster>& clusters)
+{
+	std::vector<int> groupmap(locks.size(), -1);
+
+	memset(&locks[0], 0, locks.size());
+
+	for (size_t i = 0; i < groups.size(); ++i)
+		for (size_t j = 0; j < groups[i].size(); ++j)
+		{
+			const Cluster& cluster = clusters[groups[i][j]];
+
+			for (size_t k = 0; k < cluster.indices.size(); ++k)
+			{
+				unsigned int v = cluster.indices[k];
+
+				if (groupmap[v] == -1 || groupmap[v] == int(i))
+					groupmap[v] = int(i);
+				else
+					locks[v] = 1;
+			}
+		}
+}
+
+static std::vector<unsigned int> simplify(const std::vector<Vertex>& vertices, const std::vector<unsigned int>& indices, const std::vector<unsigned char>* locks, size_t target_count, float* error = NULL)
 {
 	if (target_count > indices.size())
 		return indices;
 
 	std::vector<unsigned int> lod(indices.size());
-	unsigned int options = meshopt_SimplifyLockBorder | meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute;
-	lod.resize(meshopt_simplify(&lod[0], &indices[0], indices.size(), &vertices[0].px, vertices.size(), sizeof(Vertex), target_count, FLT_MAX, options, error));
-
+	unsigned int options = meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute;
+	if (locks)
+		lod.resize(meshopt_simplifyWithAttributes(&lod[0], &indices[0], indices.size(), &vertices[0].px, vertices.size(), sizeof(Vertex), NULL, 0, NULL, 0, &(*locks)[0], target_count, FLT_MAX, options, error));
+	else
+		lod.resize(meshopt_simplify(&lod[0], &indices[0], indices.size(), &vertices[0].px, vertices.size(), sizeof(Vertex), target_count, FLT_MAX, options | meshopt_SimplifyLockBorder, error));
 	return lod;
 }
 
@@ -431,6 +458,7 @@ void nanite(const std::vector<Vertex>& vertices, const std::vector<unsigned int>
 #endif
 
 	int depth = 0;
+	std::vector<unsigned char> locks(vertices.size());
 
 	// merge and simplify clusters until we can't merge anymore
 	while (pending.size() > 1)
@@ -449,6 +477,9 @@ void nanite(const std::vector<Vertex>& vertices, const std::vector<unsigned int>
 		if (dump && depth == atoi(dump))
 			dumpObj(vertices, std::vector<unsigned int>());
 
+		if (kUseLocks)
+			lockBoundary(locks, groups, clusters);
+
 		// every group needs to be simplified now
 		for (size_t i = 0; i < groups.size(); ++i)
 		{
@@ -479,7 +510,7 @@ void nanite(const std::vector<Vertex>& vertices, const std::vector<unsigned int>
 			dumpObj("group", merged);
 
 		float error = 0.f;
-		std::vector<unsigned int> simplified = simplify(vertices, merged, kClusterSize * 2 * 3, &error);
+		std::vector<unsigned int> simplified = simplify(vertices, merged, kUseLocks ? &locks : NULL, kClusterSize * 2 * 3, &error);
 		if (simplified.size() > merged.size() * 0.85f || simplified.size() > kClusterSize * 3 * 3)
 		{
 #if TRACE

From 43dac0d016c74e524a7184d3c797d96fdd7e8a7b Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Tue, 1 Oct 2024 18:07:28 -0700
Subject: [PATCH 2/6] demo: Add total triangle/cluster counter

This is helpful to estimate the memory impact of the full LOD chain and
makes it easier to compare stats with UE, which reports them in its log
file.
---
 demo/nanite.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/demo/nanite.cpp b/demo/nanite.cpp
index 45e7ca694..b238195ad 100644
--- a/demo/nanite.cpp
+++ b/demo/nanite.cpp
@@ -567,11 +567,16 @@ void nanite(const std::vector<Vertex>& vertices, const std::vector<unsigned int>
 		pending.insert(pending.end(), retry.begin(), retry.end());
 	}
 
+	size_t total_triangles = 0;
 	size_t lowest_triangles = 0;
 	for (size_t i = 0; i < clusters.size(); ++i)
+	{
+		total_triangles += clusters[i].indices.size() / 3;
 		if (clusters[i].parent.error == FLT_MAX)
 			lowest_triangles += clusters[i].indices.size() / 3;
+	}
 
+	printf("total: %d triangles in %d clusters\n", int(total_triangles), int(clusters.size()));
 	printf("lowest lod: %d triangles\n", int(lowest_triangles));
 
 	// for testing purposes, we can compute a DAG cut from a given viewpoint and dump it as an OBJ

From a7f7198b72ccbae08624e0eb62add5d6e7272d9d Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Tue, 1 Oct 2024 18:35:37 -0700
Subject: [PATCH 3/6] demo: Add individual clusters to debug mesh

It is pretty difficult to debug the visualized data with just groups,
without a sense of where the cluster boundaries lie; this outputs
duplicate geometry but makes it easier to work with in Blender.
---
 demo/nanite.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/demo/nanite.cpp b/demo/nanite.cpp
index b238195ad..f05b9304f 100644
--- a/demo/nanite.cpp
+++ b/demo/nanite.cpp
@@ -507,7 +507,12 @@ void nanite(const std::vector<Vertex>& vertices, const std::vector<unsigned int>
 			merged.insert(merged.end(), clusters[groups[i][j]].indices.begin(), clusters[groups[i][j]].indices.end());
 
 		if (dump && depth == atoi(dump))
+		{
+			for (size_t j = 0; j < groups[i].size(); ++j)
+				dumpObj("cluster", clusters[groups[i][j]].indices);
+
 			dumpObj("group", merged);
+		}
 
 		float error = 0.f;
 		std::vector<unsigned int> simplified = simplify(vertices, merged, kUseLocks ? &locks : NULL, kClusterSize * 2 * 3, &error);

From b35474e3298e654130568d50d29c1fbee52e5074 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Tue, 1 Oct 2024 18:51:20 -0700
Subject: [PATCH 4/6] demo: Use position remap for locking and partitioning

Previously, when we built adjacency between groups, we would not create
edges between clusters when they shared the position of a vertex but
not its attributes. This meant that when the cluster boundary contained
a crease, it would not be reflected correctly in the adjacency.
Similarly, but more importantly, we need this information when locking
the cluster boundary for the same reason; before, lockBoundary could
result in gaps because the locking information on the boundary was
inconsistent.

This allows us to enable kUseLocks by default.
---
 demo/nanite.cpp | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/demo/nanite.cpp b/demo/nanite.cpp
index f05b9304f..6ec748104 100644
--- a/demo/nanite.cpp
+++ b/demo/nanite.cpp
@@ -51,7 +51,7 @@ struct Cluster
 };
 
 const size_t kClusterSize = 128;
-const bool kUseLocks = false;
+const bool kUseLocks = true;
 
 static LODBounds bounds(const std::vector<Vertex>& vertices, const std::vector<unsigned int>& indices, float error)
 {
@@ -276,7 +276,7 @@ static std::vector<Cluster> clusterize(const std::vector<Vertex>& vertices, cons
 }
 
 #ifdef METIS
-static std::vector<std::vector<int> > partitionMetis(const std::vector<Cluster>& clusters, const std::vector<int>& pending)
+static std::vector<std::vector<int> > partitionMetis(const std::vector<Cluster>& clusters, const std::vector<int>& pending, const std::vector<unsigned int>& remap)
 {
 	std::vector<std::vector<int> > result;
 
@@ -288,8 +288,8 @@ static std::vector<std::vector<int> > partitionMetis(const std::vector<Cluster>&
 
 		for (size_t j = 0; j < cluster.indices.size(); ++j)
 		{
-			int v0 = cluster.indices[j + 0];
-			int v1 = cluster.indices[j + (j % 3 == 2 ? -2 : 1)];
+			int v0 = remap[cluster.indices[j + 0]];
+			int v1 = remap[cluster.indices[j + (j % 3 == 2 ? -2 : 1)]];
 
 			std::vector<int>& list = edges[std::make_pair(std::min(v0, v1), std::max(v0, v1))];
 			if (list.empty() || list.back() != int(i))
@@ -360,14 +360,16 @@ static std::vector<std::vector<int> > partitionMetis(const std::vector<Cluster>&
 }
 #endif
 
-static std::vector<std::vector<int> > partition(const std::vector<Cluster>& clusters, const std::vector<int>& pending)
+static std::vector<std::vector<int> > partition(const std::vector<Cluster>& clusters, const std::vector<int>& pending, const std::vector<unsigned int>& remap)
 {
 #ifdef METIS
 	static const char* metis = getenv("METIS");
 	if (metis && atoi(metis) >= 1)
-		return partitionMetis(clusters, pending);
+		return partitionMetis(clusters, pending, remap);
 #endif
 
+	(void)remap;
+
 	std::vector<std::vector<int> > result;
 
 	size_t last_indices = 0;
@@ -388,12 +390,10 @@ static std::vector<std::vector<int> > partition(const std::vector<Cluster>& clus
 	return result;
 }
 
-static void lockBoundary(std::vector<unsigned char>& locks, const std::vector<std::vector<int> >& groups, const std::vector<Cluster>& clusters)
+static void lockBoundary(std::vector<unsigned char>& locks, const std::vector<std::vector<int> >& groups, const std::vector<Cluster>& clusters, const std::vector<unsigned int>& remap)
 {
 	std::vector<int> groupmap(locks.size(), -1);
 
-	memset(&locks[0], 0, locks.size());
-
 	for (size_t i = 0; i < groups.size(); ++i)
 		for (size_t j = 0; j < groups[i].size(); ++j)
 		{
@@ -402,13 +402,22 @@ static void lockBoundary(std::vector<unsigned char>& locks, const std::vector<std::vector<int> >&
 			for (size_t k = 0; k < cluster.indices.size(); ++k)
 			{
 				unsigned int v = cluster.indices[k];
+				unsigned int r = remap[v];
 
-				if (groupmap[v] == -1 || groupmap[v] == int(i))
-					groupmap[v] = int(i);
+				if (groupmap[r] == -1 || groupmap[r] == int(i))
+					groupmap[r] = int(i);
 				else
-					locks[v] = 1;
+					groupmap[r] = -2;
 			}
 		}
+
+	// lock all vertices that are shared between groups; going through the remap keeps all copies of a position consistent
+	for (size_t i = 0; i < locks.size(); ++i)
+	{
+		unsigned int r = remap[i];
+
+		locks[i] = groupmap[r] == -2;
+	}
 }
 
 static std::vector<unsigned int> simplify(const std::vector<Vertex>& vertices, const std::vector<unsigned int>& indices, const std::vector<unsigned char>* locks, size_t target_count, float* error = NULL)
@@ -460,10 +469,16 @@ void nanite(const std::vector<Vertex>& vertices, const std::vector<unsigned int>
 	int depth = 0;
 	std::vector<unsigned char> locks(vertices.size());
 
+	// for cluster connectivity, we need a position-only remap that maps vertices with the same position to the same index
+	// it's more efficient to build it once; unfortunately, meshopt_generateVertexRemap doesn't support stride so we need to use *Multi version
+	std::vector<unsigned int> remap(vertices.size());
+	meshopt_Stream position = {&vertices[0].px, sizeof(float) * 3, sizeof(Vertex)};
+	meshopt_generateVertexRemapMulti(&remap[0], &indices[0], indices.size(), vertices.size(), &position, 1);
+
 	// merge and simplify clusters until we can't merge anymore
 	while (pending.size() > 1)
 	{
-		std::vector<std::vector<int> > groups = partition(clusters, pending);
+		std::vector<std::vector<int> > groups = partition(clusters, pending, remap);
 		pending.clear();
 
 		std::vector<int> retry;
@@ -478,7 +493,7 @@ void nanite(const std::vector<Vertex>& vertices, const std::vector<unsigned int>
 			dumpObj(vertices, std::vector<unsigned int>());
 
 		if (kUseLocks)
-			lockBoundary(locks, groups, clusters);
+			lockBoundary(locks, groups, clusters, remap);
 
 		// every group needs to be simplified now
 		for (size_t i = 0; i < groups.size(); ++i)

From 8a0beac45266571a450935b1b669332b19671efb Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Tue, 1 Oct 2024 19:03:24 -0700
Subject: [PATCH 5/6] demo: Use vertex adjacency for partitioning

Instead of using shared edges to provide adjacency, we now use shared
vertices; this works more or less the same way but is easier to
compute. For some reason this also results in fewer stuck triangles
when using the full METIS pipeline.
---
 demo/nanite.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/demo/nanite.cpp b/demo/nanite.cpp
index 6ec748104..13f7c2167 100644
--- a/demo/nanite.cpp
+++ b/demo/nanite.cpp
@@ -279,8 +279,7 @@ static std::vector<Cluster> clusterize(const std::vector<Vertex>& vertices, cons
 static std::vector<std::vector<int> > partitionMetis(const std::vector<Cluster>& clusters, const std::vector<int>& pending, const std::vector<unsigned int>& remap)
 {
 	std::vector<std::vector<int> > result;
-
-	std::map<std::pair<int, int>, std::vector<int> > edges;
+	std::vector<std::vector<int> > vertices(remap.size());
 
 	for (size_t i = 0; i < pending.size(); ++i)
 	{
@@ -288,10 +287,9 @@ static std::vector<std::vector<int> > partitionMetis(const std::vector<Cluster>&
 
 		for (size_t j = 0; j < cluster.indices.size(); ++j)
 		{
-			int v0 = remap[cluster.indices[j + 0]];
-			int v1 = remap[cluster.indices[j + (j % 3 == 2 ? -2 : 1)]];
+			int v = remap[cluster.indices[j]];
 
-			std::vector<int>& list = edges[std::make_pair(std::min(v0, v1), std::max(v0, v1))];
+			std::vector<int>& list = vertices[v];
 			if (list.empty() || list.back() != int(i))
 				list.push_back(int(i));
 		}
@@ -299,9 +297,9 @@ static std::vector<std::vector<int> > partitionMetis(const std::vector<Cluster>&
 
 	std::map<std::pair<int, int>, int> adjacency;
 
-	for (std::map<std::pair<int, int>, std::vector<int> >::iterator it = edges.begin(); it != edges.end(); ++it)
+	for (size_t v = 0; v < vertices.size(); ++v)
 	{
-		const std::vector<int>& list = it->second;
+		const std::vector<int>& list = vertices[v];
 
 		for (size_t i = 0; i < list.size(); ++i)
 			for (size_t j = i + 1; j < list.size(); ++j)

From a2f74c6a0e8f36db30be6f3b2904135167e9711f Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Tue, 1 Oct 2024 19:45:31 -0700
Subject: [PATCH 6/6] demo: Use a flexible (and larger) cluster group size

Using 4 clusters per group is fairly limiting with respect to partition
quality, as it demands very careful grouping that METIS is not willing
to provide. Instead, use a larger group size (8 at the moment), and
adjust the simplification condition to be adaptive to the group size so
that partitioning is free to produce groups of varying sizes.
---
 demo/nanite.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/demo/nanite.cpp b/demo/nanite.cpp
index 13f7c2167..868bc1c2a 100644
--- a/demo/nanite.cpp
+++ b/demo/nanite.cpp
@@ -51,6 +51,7 @@ struct Cluster
 };
 
 const size_t kClusterSize = 128;
+const size_t kGroupSize = 8;
 const bool kUseLocks = true;
 
 static LODBounds bounds(const std::vector<Vertex>& vertices, const std::vector<unsigned int>& indices, float error)
@@ -335,7 +336,7 @@ static std::vector<std::vector<int> > partitionMetis(const std::vector<Cluster>&
 
 	int nvtxs = int(pending.size());
 	int ncon = 1;
-	int nparts = int(pending.size() + 3) / 4;
+	int nparts = int(pending.size() + kGroupSize - 1) / kGroupSize;
 	int edgecut = 0;
 
 	if (nparts <= 1)
@@ -375,7 +376,7 @@ static std::vector<std::vector<int> > partition(const std::vector<Cluster>& clus
 	// rough merge; while clusters are approximately spatially ordered, this should use a proper partitioning algorithm
 	for (size_t i = 0; i < pending.size(); ++i)
 	{
-		if (result.empty() || last_indices + clusters[pending[i]].indices.size() > kClusterSize * 4 * 3)
+		if (result.empty() || last_indices + clusters[pending[i]].indices.size() > kClusterSize * kGroupSize * 3)
 		{
 			result.push_back(std::vector<int>());
 			last_indices = 0;
@@ -527,9 +528,10 @@ void nanite(const std::vector<Vertex>& vertices, const std::vector<unsigned int>
 			dumpObj("group", merged);
 		}
 
+		size_t target_size = ((groups[i].size() + 1) / 2) * kClusterSize * 3;
 		float error = 0.f;
-		std::vector<unsigned int> simplified = simplify(vertices, merged, kUseLocks ? &locks : NULL, kClusterSize * 2 * 3, &error);
-		if (simplified.size() > merged.size() * 0.85f || simplified.size() > kClusterSize * 3 * 3)
+		std::vector<unsigned int> simplified = simplify(vertices, merged, kUseLocks ? &locks : NULL, target_size, &error);
+		if (simplified.size() > merged.size() * 0.85f || simplified.size() / (kClusterSize * 3) >= merged.size() / (kClusterSize * 3))
 		{
 #if TRACE
 			printf("stuck cluster: simplified %d => %d over threshold\n", int(merged.size() / 3), int(simplified.size() / 3));
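
For readers following the series outside the demo, here is a small self-contained sketch of the boundary-locking approach the patches converge on: build a position-only remap once, mark position slots referenced by more than one cluster group, propagate the lock to every vertex copy so attribute seams stay consistent, and feed the flags to the simplifier. This is an illustration only, not code from the patches; the Vertex layout, the group_indices representation, the include path, and the helper names lockSharedBoundaries/simplifyGroup are assumptions made for the example.

// Illustrative sketch; mirrors the idea of lockBoundary + simplify above.
// The types, names, and include path here are assumptions, not the demo's actual code.
#include <float.h>
#include <vector>
#include "meshoptimizer.h" // assumed include path

struct Vertex { float px, py, pz, nx, ny, nz, tx, ty; }; // assumed layout: position first

// Lock every vertex whose *position* is referenced by more than one cluster group.
static std::vector<unsigned char> lockSharedBoundaries(const std::vector<Vertex>& vertices,
    const std::vector<std::vector<unsigned int> >& group_indices)
{
	// Gather all indices so the remap covers every vertex referenced by any group.
	std::vector<unsigned int> all_indices;
	for (size_t g = 0; g < group_indices.size(); ++g)
		all_indices.insert(all_indices.end(), group_indices[g].begin(), group_indices[g].end());

	// Position-only remap: copies that differ only in attributes map to the same slot.
	std::vector<unsigned int> remap(vertices.size());
	meshopt_Stream position = {&vertices[0].px, sizeof(float) * 3, sizeof(Vertex)};
	meshopt_generateVertexRemapMulti(&remap[0], &all_indices[0], all_indices.size(), vertices.size(), &position, 1);

	// owner[r] = the single group that uses position slot r, or -2 once a second group shows up.
	std::vector<int> owner(vertices.size(), -1);
	for (size_t g = 0; g < group_indices.size(); ++g)
		for (size_t k = 0; k < group_indices[g].size(); ++k)
		{
			unsigned int r = remap[group_indices[g][k]];
			owner[r] = (owner[r] == -1 || owner[r] == int(g)) ? int(g) : -2;
		}

	// Propagate the lock to every copy of a shared position so seams stay watertight.
	std::vector<unsigned char> locks(vertices.size());
	for (size_t i = 0; i < vertices.size(); ++i)
		locks[i] = owner[remap[i]] == -2;

	return locks;
}

// Simplify one merged group while keeping inter-group (locked) vertices in place.
static std::vector<unsigned int> simplifyGroup(const std::vector<Vertex>& vertices,
    const std::vector<unsigned int>& indices, const std::vector<unsigned char>& locks,
    size_t target_index_count, float* out_error)
{
	std::vector<unsigned int> lod(indices.size());
	lod.resize(meshopt_simplifyWithAttributes(&lod[0], &indices[0], indices.size(),
	    &vertices[0].px, vertices.size(), sizeof(Vertex),
	    NULL, 0, NULL, 0, &locks[0],
	    target_index_count, FLT_MAX,
	    meshopt_SimplifySparse | meshopt_SimplifyErrorAbsolute, out_error));
	return lod;
}

Counting groups per remapped position rather than per raw vertex index is what keeps the lock flags consistent across attribute seams, which is the inconsistency patch 4 describes fixing.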