From b0f634bcaf618e030a3e50506d557f8a8c7b297a Mon Sep 17 00:00:00 2001 From: Michael Barry Date: Fri, 3 Jun 2022 20:44:49 -0400 Subject: [PATCH] Use fnv1a64 for tile hash (#253) --- .../basemap/util/VerifyMonacoTest.java | 4 +- .../benchmarks/BenchmarkMbtilesWriter.java | 18 ++++----- .../planetiler/collection/FeatureGroup.java | 8 ++-- .../planetiler/mbtiles/Mbtiles.java | 29 ++++++++++++-- .../planetiler/mbtiles/MbtilesWriter.java | 9 +++-- .../mbtiles/TileEncodingResult.java | 4 +- .../onthegomap/planetiler/util/Hashing.java | 39 ++++++++++++++++++- .../planetiler/mbtiles/MbtilesTest.java | 4 +- .../planetiler/mbtiles/VerifyTest.java | 6 +-- .../planetiler/util/HashingTest.java | 34 ++++++++++++---- 10 files changed, 116 insertions(+), 39 deletions(-) diff --git a/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/VerifyMonacoTest.java b/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/VerifyMonacoTest.java index 6e32f458..450edd6b 100644 --- a/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/VerifyMonacoTest.java +++ b/planetiler-basemap/src/test/java/com/onthegomap/planetiler/basemap/util/VerifyMonacoTest.java @@ -11,7 +11,7 @@ import com.onthegomap.planetiler.mbtiles.TileEncodingResult; import java.io.IOException; import java.util.List; import java.util.Map; -import java.util.OptionalInt; +import java.util.OptionalLong; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -53,7 +53,7 @@ class VerifyMonacoTest { VectorTile.encodeGeometry(point(0, 0)), Map.of() ))); - writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalInt.empty())); + writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalLong.empty())); } assertInvalid(mbtiles); } diff --git a/planetiler-benchmarks/src/main/java/com/onthegomap/planetiler/benchmarks/BenchmarkMbtilesWriter.java 
b/planetiler-benchmarks/src/main/java/com/onthegomap/planetiler/benchmarks/BenchmarkMbtilesWriter.java index 70bb411b..30d0ed8e 100644 --- a/planetiler-benchmarks/src/main/java/com/onthegomap/planetiler/benchmarks/BenchmarkMbtilesWriter.java +++ b/planetiler-benchmarks/src/main/java/com/onthegomap/planetiler/benchmarks/BenchmarkMbtilesWriter.java @@ -12,7 +12,7 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.DoubleSummaryStatistics; -import java.util.OptionalInt; +import java.util.OptionalLong; import java.util.Random; import java.util.concurrent.TimeUnit; import org.slf4j.Logger; @@ -34,23 +34,23 @@ public class BenchmarkMbtilesWriter { */ int distinctTilesInPercent = arguments.getInteger("bench_distinct_tiles", "distinct tiles in percent", 10); /* - * select avg(length(tile_data)) - * from (select tile_data_id from tiles_shallow group by tile_data_id having count(*) = 1) as x + * select avg(length(tile_data)) + * from (select tile_data_id from tiles_shallow group by tile_data_id having count(*) = 1) as x * join tiles_data using(tile_data_id) * => ~785 (Australia) */ int distinctTileDataSize = arguments.getInteger("bench_distinct_tile_data_size", "distinct tile data size in bytes", 800); /* - * select avg(length(tile_data)) - * from (select tile_data_id from tiles_shallow group by tile_data_id having count(*) > 1) as x - * join tiles_shallow using(tile_data_id) + * select avg(length(tile_data)) + * from (select tile_data_id from tiles_shallow group by tile_data_id having count(*) > 1) as x + * join tiles_shallow using(tile_data_id) * join tiles_data using(tile_data_id) * => ~93 (Australia) */ int dupeTileDataSize = arguments.getInteger("bench_dupe_tile_data_size", "dupe tile data size in bytes", 100); /* - * select count(*) * 100.0 / sum(usage_count) + * select count(*) * 100.0 / sum(usage_count) * from (select tile_data_id, count(*) as usage_count from tiles_shallow group by tile_data_id having count(*) > 1) 
* => ~0.17% (Australia) */ @@ -107,11 +107,11 @@ public class BenchmarkMbtilesWriter { TileCoord coord = TileCoord.ofXYZ(x, y, z); TileEncodingResult toWrite; if (tilesWritten % 100 < distinctTilesInPercent) { - toWrite = new TileEncodingResult(coord, distinctTileData, OptionalInt.empty()); + toWrite = new TileEncodingResult(coord, distinctTileData, OptionalLong.empty()); } else { ++dupeCounter; int hash = dupeHashMod == 0 ? 0 : dupeCounter % dupeHashMod; - toWrite = new TileEncodingResult(coord, dupeTileData, OptionalInt.of(hash)); + toWrite = new TileEncodingResult(coord, dupeTileData, OptionalLong.of(hash)); } writer.write(toWrite); diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java index 3b1dc373..c152f2d8 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/collection/FeatureGroup.java @@ -378,12 +378,12 @@ public final class FeatureGroup implements Iterable, *
<p>
* Used as an optimization to avoid writing the same (ocean) tiles over and over again. */ - public int generateContentHash() { - int hash = Hashing.FNV1_32_INIT; + public long generateContentHash() { + long hash = Hashing.FNV1_64_INIT; for (var feature : entries) { byte layerId = extractLayerIdFromKey(feature.key()); - hash = Hashing.fnv1a32(hash, layerId); - hash = Hashing.fnv1a32(hash, feature.value()); + hash = Hashing.fnv1a64(hash, layerId); + hash = Hashing.fnv1a64(hash, feature.value()); } return hash; } diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/Mbtiles.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/Mbtiles.java index 7a8eb66a..8438755e 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/Mbtiles.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/Mbtiles.java @@ -2,13 +2,14 @@ package com.onthegomap.planetiler.mbtiles; import static com.fasterxml.jackson.annotation.JsonInclude.Include.NON_ABSENT; -import com.carrotsearch.hppc.IntIntHashMap; +import com.carrotsearch.hppc.LongIntHashMap; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.datatype.jdk8.Jdk8Module; import com.onthegomap.planetiler.geo.GeoUtils; import com.onthegomap.planetiler.geo.TileCoord; +import com.onthegomap.planetiler.util.Format; import java.io.Closeable; import java.io.IOException; import java.nio.file.Path; @@ -29,6 +30,7 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; import java.util.OptionalInt; +import java.util.OptionalLong; import java.util.TreeMap; import java.util.stream.Collectors; import java.util.stream.DoubleStream; @@ -509,6 +511,7 @@ public final class Mbtiles implements Closeable { private final boolean insertStmtInsertIgnore; private final String insertStmtValuesPlaceHolder; private final String 
insertStmtColumnsCsv; + private long count = 0; protected BatchedTableWriterBase(String tableName, List columns, boolean insertIgnore) { @@ -523,6 +526,7 @@ public final class Mbtiles implements Closeable { /** Queue-up a write or flush to disk if enough are waiting. */ void write(T item) { + count++; batch.add(item); if (batch.size() >= batchLimit) { flush(batchStatement); @@ -561,6 +565,10 @@ public final class Mbtiles implements Closeable { } } + public long count() { + return count; + } + @Override public void close() { if (!batch.isEmpty()) { @@ -660,6 +668,8 @@ public final class Mbtiles implements Closeable { @Override void close(); + + default void printStats() {} } private class BatchedNonCompactTileWriter implements BatchedTileWriter { @@ -682,7 +692,7 @@ public final class Mbtiles implements Closeable { private final BatchedTileShallowTableWriter batchedTileShallowTableWriter = new BatchedTileShallowTableWriter(); private final BatchedTileDataTableWriter batchedTileDataTableWriter = new BatchedTileDataTableWriter(); - private final IntIntHashMap tileDataIdByHash = new IntIntHashMap(1_000); + private final LongIntHashMap tileDataIdByHash = new LongIntHashMap(1_000); private int tileDataIdCounter = 1; @@ -690,10 +700,10 @@ public final class Mbtiles implements Closeable { public void write(TileEncodingResult encodingResult) { int tileDataId; boolean writeData; - OptionalInt tileDataHashOpt = encodingResult.tileDataHash(); + OptionalLong tileDataHashOpt = encodingResult.tileDataHash(); if (tileDataHashOpt.isPresent()) { - int tileDataHash = tileDataHashOpt.getAsInt(); + long tileDataHash = tileDataHashOpt.getAsLong(); if (tileDataIdByHash.containsKey(tileDataHash)) { tileDataId = tileDataIdByHash.get(tileDataHash); writeData = false; @@ -717,6 +727,17 @@ public final class Mbtiles implements Closeable { batchedTileShallowTableWriter.close(); batchedTileDataTableWriter.close(); } + + @Override + public void printStats() { + if (LOGGER.isDebugEnabled()) { + 
var format = Format.defaultInstance(); + LOGGER.debug("Shallow tiles written: {}", format.integer(batchedTileShallowTableWriter.count())); + LOGGER.debug("Tile data written: {} ({} omitted)", format.integer(batchedTileDataTableWriter.count()), + format.percent(1d - batchedTileDataTableWriter.count() * 1d / batchedTileShallowTableWriter.count())); + LOGGER.debug("Unique tile hashes: {}", format.integer(tileDataIdByHash.size())); + } + } } diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java index b0238dab..c6d3760a 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java @@ -27,7 +27,7 @@ import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.OptionalInt; +import java.util.OptionalLong; import java.util.Queue; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; @@ -257,7 +257,7 @@ public class MbtilesWriter { * recomputing if the input hasn't changed. */ byte[] lastBytes = null, lastEncoded = null; - Integer lastTileDataHash = null; + Long lastTileDataHash = null; boolean compactDb = config.compactDb(); for (TileBatch batch : prev) { @@ -268,7 +268,7 @@ public class MbtilesWriter { FeatureGroup.TileFeatures tileFeatures = batch.in.get(i); featuresProcessed.incBy(tileFeatures.getNumFeaturesProcessed()); byte[] bytes, encoded; - Integer tileDataHash; + Long tileDataHash; if (tileFeatures.hasSameContents(last)) { bytes = lastBytes; encoded = lastEncoded; @@ -299,7 +299,7 @@ public class MbtilesWriter { maxTileSizesByZoom[zoom].accumulate(encodedLength); result.add( new TileEncodingResult(tileFeatures.tileCoord(), bytes, - tileDataHash == null ? 
OptionalInt.empty() : OptionalInt.of(tileDataHash)) + tileDataHash == null ? OptionalLong.empty() : OptionalLong.of(tileDataHash)) ); } // hand result off to writer @@ -361,6 +361,7 @@ public class MbtilesWriter { } lastTileWritten.set(lastTile); } + batchedTileWriter.printStats(); } if (time != null) { diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/TileEncodingResult.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/TileEncodingResult.java index 66a6f532..a8504f2e 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/TileEncodingResult.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/TileEncodingResult.java @@ -3,13 +3,13 @@ package com.onthegomap.planetiler.mbtiles; import com.onthegomap.planetiler.geo.TileCoord; import java.util.Arrays; import java.util.Objects; -import java.util.OptionalInt; +import java.util.OptionalLong; public record TileEncodingResult( TileCoord coord, byte[] tileData, /** will always be empty in non-compact mode and might also be empty in compact mode */ - OptionalInt tileDataHash + OptionalLong tileDataHash ) { @Override diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Hashing.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Hashing.java index dbfabcaa..45bc476e 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Hashing.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Hashing.java @@ -12,6 +12,12 @@ public final class Hashing { public static final int FNV1_32_INIT = 0x811c9dc5; private static final int FNV1_PRIME_32 = 16777619; + /** + * Initial hash for the FNV-1 and FNV-1a 64-bit hash function. + */ + public static final long FNV1_64_INIT = 0xcbf29ce484222325L; + private static final long FNV1_PRIME_64 = 1099511628211L; + private Hashing() {} /** @@ -19,7 +25,7 @@ public final class Hashing { *
<p>
* The hash generation must always start with {@link #FNV1_32_INIT} as initial hash but this version comes in handy * when generating the hash for multiple bytes consecutively in a loop. - * + * * @param initHash the initial hash * @param data the data to generate the hash for * @return the generated hash @@ -35,7 +41,7 @@ public final class Hashing { /** * Computes the hash using the FNV-1a 32-bit hash function. - * + * * @param data the data to generate the hash for * @return the hash */ @@ -43,4 +49,33 @@ public final class Hashing { return fnv1a32(FNV1_32_INIT, data); } + /** + * Computes the hash using the FNV-1a 64-bit hash function, starting with the initial hash. + *
<p>
+ * The hash generation must always start with {@link #FNV1_64_INIT} as initial hash but this version comes in handy + * when generating the hash for multiple bytes consecutively in a loop. + * + * @param initHash the initial hash + * @param data the data to generate the hash for + * @return the generated hash + */ + public static long fnv1a64(long initHash, byte... data) { + long hash = initHash; + for (byte datum : data) { + hash ^= (datum & 0xff); + hash *= FNV1_PRIME_64; + } + return hash; + } + + /** + * Computes the hash using the FNV-1a 64-bit hash function. + * + * @param data the data to generate the hash for + * @return the hash + */ + public static long fnv1a64(byte... data) { + return fnv1a64(FNV1_64_INIT, data); + } + } diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/mbtiles/MbtilesTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/mbtiles/MbtilesTest.java index fcc32268..605c8e0a 100644 --- a/planetiler-core/src/test/java/com/onthegomap/planetiler/mbtiles/MbtilesTest.java +++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/mbtiles/MbtilesTest.java @@ -14,7 +14,7 @@ import java.sql.Statement; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.OptionalInt; +import java.util.OptionalLong; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; @@ -55,7 +55,7 @@ class MbtilesTest { (byte) (dataBase >> 16), (byte) (dataBase >> 24) }); - writer.write(new TileEncodingResult(entry.tile(), entry.bytes(), OptionalInt.of(dataHash))); + writer.write(new TileEncodingResult(entry.tile(), entry.bytes(), OptionalLong.of(dataHash))); expected.add(entry); } } diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/mbtiles/VerifyTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/mbtiles/VerifyTest.java index 4fe2a82e..843ce778 100644 --- a/planetiler-core/src/test/java/com/onthegomap/planetiler/mbtiles/VerifyTest.java +++ 
b/planetiler-core/src/test/java/com/onthegomap/planetiler/mbtiles/VerifyTest.java @@ -11,7 +11,7 @@ import com.onthegomap.planetiler.geo.TileCoord; import java.io.IOException; import java.util.List; import java.util.Map; -import java.util.OptionalInt; +import java.util.OptionalLong; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -53,7 +53,7 @@ class VerifyTest { VectorTile.encodeGeometry(point(0, 0)), Map.of() ))); - writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalInt.empty())); + writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalLong.empty())); } assertValid(mbtiles); } @@ -77,7 +77,7 @@ class VerifyTest { )), Map.of() ))); - writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalInt.empty())); + writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalLong.empty())); } assertInvalid(mbtiles); } diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/util/HashingTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/util/HashingTest.java index 55588612..9432f198 100644 --- a/planetiler-core/src/test/java/com/onthegomap/planetiler/util/HashingTest.java +++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/util/HashingTest.java @@ -3,20 +3,40 @@ package com.onthegomap.planetiler.util; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotEquals; +import java.util.function.Function; import org.junit.jupiter.api.Test; class HashingTest { @Test void testFnv1a32() { - assertEquals(Hashing.fnv1a32(), Hashing.fnv1a32()); - assertEquals(Hashing.fnv1a32((byte) 1), Hashing.fnv1a32((byte) 1)); - assertEquals(Hashing.fnv1a32((byte) 1, (byte) 2), Hashing.fnv1a32((byte) 1, (byte) 2)); - assertNotEquals(Hashing.fnv1a32((byte) 1), Hashing.fnv1a32((byte) 2)); - 
assertNotEquals(Hashing.fnv1a32((byte) 1), Hashing.fnv1a32((byte) 1, (byte) 1)); - - assertEquals(Hashing.FNV1_32_INIT, Hashing.fnv1a32()); + testHasher(Hashing::fnv1a32, Hashing.FNV1_32_INIT); assertEquals(123, Hashing.fnv1a32(123)); } + @Test + void testFnv1a64() { + testHasher(Hashing::fnv1a64, Hashing.FNV1_64_INIT); + assertEquals(123, Hashing.fnv1a64(123)); + } + + private static byte[] bytes(int... bytes) { + byte[] result = new byte[bytes.length]; + for (int i = 0; i < bytes.length; i++) { + int value = bytes[i]; + assert value >= 0 && value < 256; + result[i] = (byte) value; + } + return result; + } + + private static <T> void testHasher(Function<byte[], T> hashFn, T init) { + assertEquals(hashFn.apply(bytes()), hashFn.apply(bytes())); + assertEquals(hashFn.apply(bytes(1)), hashFn.apply(bytes(1))); + assertEquals(hashFn.apply(bytes(1, 2)), hashFn.apply(bytes(1, 2))); + assertNotEquals(hashFn.apply(bytes(1)), hashFn.apply(bytes(2))); + assertNotEquals(hashFn.apply(bytes(1)), hashFn.apply(bytes(1, 1))); + + assertEquals(init, hashFn.apply(bytes())); + } }