Use fnv1a64 for tile hash (#253)

pull/234/head^2
Michael Barry 2022-06-03 20:44:49 -04:00 zatwierdzone przez GitHub
rodzic c39c667f02
commit b0f634bcaf
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
10 zmienionych plików z 116 dodań i 39 usunięć

Wyświetl plik

@ -11,7 +11,7 @@ import com.onthegomap.planetiler.mbtiles.TileEncodingResult;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
import java.util.OptionalLong;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -53,7 +53,7 @@ class VerifyMonacoTest {
VectorTile.encodeGeometry(point(0, 0)),
Map.of()
)));
writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalInt.empty()));
writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalLong.empty()));
}
assertInvalid(mbtiles);
}

Wyświetl plik

@ -12,7 +12,7 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.DoubleSummaryStatistics;
import java.util.OptionalInt;
import java.util.OptionalLong;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
@ -34,23 +34,23 @@ public class BenchmarkMbtilesWriter {
*/
int distinctTilesInPercent = arguments.getInteger("bench_distinct_tiles", "distinct tiles in percent", 10);
/*
* select avg(length(tile_data))
* from (select tile_data_id from tiles_shallow group by tile_data_id having count(*) = 1) as x
* select avg(length(tile_data))
* from (select tile_data_id from tiles_shallow group by tile_data_id having count(*) = 1) as x
* join tiles_data using(tile_data_id)
* => ~785 (Australia)
*/
int distinctTileDataSize =
arguments.getInteger("bench_distinct_tile_data_size", "distinct tile data size in bytes", 800);
/*
* select avg(length(tile_data))
* from (select tile_data_id from tiles_shallow group by tile_data_id having count(*) > 1) as x
* join tiles_shallow using(tile_data_id)
* select avg(length(tile_data))
* from (select tile_data_id from tiles_shallow group by tile_data_id having count(*) > 1) as x
* join tiles_shallow using(tile_data_id)
* join tiles_data using(tile_data_id)
* => ~93 (Australia)
*/
int dupeTileDataSize = arguments.getInteger("bench_dupe_tile_data_size", "dupe tile data size in bytes", 100);
/*
* select count(*) * 100.0 / sum(usage_count)
* select count(*) * 100.0 / sum(usage_count)
* from (select tile_data_id, count(*) as usage_count from tiles_shallow group by tile_data_id having count(*) > 1)
* => ~0.17% (Australia)
*/
@ -107,11 +107,11 @@ public class BenchmarkMbtilesWriter {
TileCoord coord = TileCoord.ofXYZ(x, y, z);
TileEncodingResult toWrite;
if (tilesWritten % 100 < distinctTilesInPercent) {
toWrite = new TileEncodingResult(coord, distinctTileData, OptionalInt.empty());
toWrite = new TileEncodingResult(coord, distinctTileData, OptionalLong.empty());
} else {
++dupeCounter;
int hash = dupeHashMod == 0 ? 0 : dupeCounter % dupeHashMod;
toWrite = new TileEncodingResult(coord, dupeTileData, OptionalInt.of(hash));
toWrite = new TileEncodingResult(coord, dupeTileData, OptionalLong.of(hash));
}
writer.write(toWrite);

Wyświetl plik

@ -378,12 +378,12 @@ public final class FeatureGroup implements Iterable<FeatureGroup.TileFeatures>,
* <p>
* Used as an optimization to avoid writing the same (ocean) tiles over and over again.
*/
public int generateContentHash() {
int hash = Hashing.FNV1_32_INIT;
public long generateContentHash() {
long hash = Hashing.FNV1_64_INIT;
for (var feature : entries) {
byte layerId = extractLayerIdFromKey(feature.key());
hash = Hashing.fnv1a32(hash, layerId);
hash = Hashing.fnv1a32(hash, feature.value());
hash = Hashing.fnv1a64(hash, layerId);
hash = Hashing.fnv1a64(hash, feature.value());
}
return hash;
}

Wyświetl plik

@ -2,13 +2,14 @@ package com.onthegomap.planetiler.mbtiles;
import static com.fasterxml.jackson.annotation.JsonInclude.Include.NON_ABSENT;
import com.carrotsearch.hppc.IntIntHashMap;
import com.carrotsearch.hppc.LongIntHashMap;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.datatype.jdk8.Jdk8Module;
import com.onthegomap.planetiler.geo.GeoUtils;
import com.onthegomap.planetiler.geo.TileCoord;
import com.onthegomap.planetiler.util.Format;
import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Path;
@ -29,6 +30,7 @@ import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.OptionalLong;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.DoubleStream;
@ -509,6 +511,7 @@ public final class Mbtiles implements Closeable {
private final boolean insertStmtInsertIgnore;
private final String insertStmtValuesPlaceHolder;
private final String insertStmtColumnsCsv;
private long count = 0;
protected BatchedTableWriterBase(String tableName, List<String> columns, boolean insertIgnore) {
@ -523,6 +526,7 @@ public final class Mbtiles implements Closeable {
/** Queue-up a write or flush to disk if enough are waiting. */
void write(T item) {
count++;
batch.add(item);
if (batch.size() >= batchLimit) {
flush(batchStatement);
@ -561,6 +565,10 @@ public final class Mbtiles implements Closeable {
}
}
/**
 * Reports how many items have been handed to {@code write} on this writer.
 * <p>
 * The tally is bumped as soon as an item is queued, so it includes items that are still waiting in the current
 * batch and have not been flushed yet.
 *
 * @return the value of the running item counter
 */
public long count() {
  return this.count;
}
@Override
public void close() {
if (!batch.isEmpty()) {
@ -660,6 +668,8 @@ public final class Mbtiles implements Closeable {
@Override
void close();
/** Hook for reporting writer statistics; does nothing unless an implementation overrides it. */
default void printStats() {}
}
private class BatchedNonCompactTileWriter implements BatchedTileWriter {
@ -682,7 +692,7 @@ public final class Mbtiles implements Closeable {
private final BatchedTileShallowTableWriter batchedTileShallowTableWriter = new BatchedTileShallowTableWriter();
private final BatchedTileDataTableWriter batchedTileDataTableWriter = new BatchedTileDataTableWriter();
private final IntIntHashMap tileDataIdByHash = new IntIntHashMap(1_000);
private final LongIntHashMap tileDataIdByHash = new LongIntHashMap(1_000);
private int tileDataIdCounter = 1;
@ -690,10 +700,10 @@ public final class Mbtiles implements Closeable {
public void write(TileEncodingResult encodingResult) {
int tileDataId;
boolean writeData;
OptionalInt tileDataHashOpt = encodingResult.tileDataHash();
OptionalLong tileDataHashOpt = encodingResult.tileDataHash();
if (tileDataHashOpt.isPresent()) {
int tileDataHash = tileDataHashOpt.getAsInt();
long tileDataHash = tileDataHashOpt.getAsLong();
if (tileDataIdByHash.containsKey(tileDataHash)) {
tileDataId = tileDataIdByHash.get(tileDataHash);
writeData = false;
@ -717,6 +727,17 @@ public final class Mbtiles implements Closeable {
batchedTileShallowTableWriter.close();
batchedTileDataTableWriter.close();
}
/**
 * Logs, at debug level only, how many shallow tile rows and tile data rows were written, the share of tile data
 * writes that were omitted, and how many distinct tile hashes were tracked.
 */
@Override
public void printStats() {
  if (!LOGGER.isDebugEnabled()) {
    return;
  }
  var format = Format.defaultInstance();
  long shallowCount = batchedTileShallowTableWriter.count();
  long dataCount = batchedTileDataTableWriter.count();
  // With nothing written the ratio would be 0 / 0 = NaN; report "nothing omitted" instead of a garbage percentage.
  double omittedFraction = shallowCount == 0 ? 0d : 1d - dataCount * 1d / shallowCount;
  LOGGER.debug("Shallow tiles written: {}", format.integer(shallowCount));
  LOGGER.debug("Tile data written: {} ({} omitted)", format.integer(dataCount), format.percent(omittedFraction));
  LOGGER.debug("Unique tile hashes: {}", format.integer(tileDataIdByHash.size()));
}
}

Wyświetl plik

@ -27,7 +27,7 @@ import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
import java.util.OptionalLong;
import java.util.Queue;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
@ -257,7 +257,7 @@ public class MbtilesWriter {
* recomputing if the input hasn't changed.
*/
byte[] lastBytes = null, lastEncoded = null;
Integer lastTileDataHash = null;
Long lastTileDataHash = null;
boolean compactDb = config.compactDb();
for (TileBatch batch : prev) {
@ -268,7 +268,7 @@ public class MbtilesWriter {
FeatureGroup.TileFeatures tileFeatures = batch.in.get(i);
featuresProcessed.incBy(tileFeatures.getNumFeaturesProcessed());
byte[] bytes, encoded;
Integer tileDataHash;
Long tileDataHash;
if (tileFeatures.hasSameContents(last)) {
bytes = lastBytes;
encoded = lastEncoded;
@ -299,7 +299,7 @@ public class MbtilesWriter {
maxTileSizesByZoom[zoom].accumulate(encodedLength);
result.add(
new TileEncodingResult(tileFeatures.tileCoord(), bytes,
tileDataHash == null ? OptionalInt.empty() : OptionalInt.of(tileDataHash))
tileDataHash == null ? OptionalLong.empty() : OptionalLong.of(tileDataHash))
);
}
// hand result off to writer
@ -361,6 +361,7 @@ public class MbtilesWriter {
}
lastTileWritten.set(lastTile);
}
batchedTileWriter.printStats();
}
if (time != null) {

Wyświetl plik

@ -3,13 +3,13 @@ package com.onthegomap.planetiler.mbtiles;
import com.onthegomap.planetiler.geo.TileCoord;
import java.util.Arrays;
import java.util.Objects;
import java.util.OptionalInt;
import java.util.OptionalLong;
public record TileEncodingResult(
TileCoord coord,
byte[] tileData,
/** will always be empty in non-compact mode and might also be empty in compact mode */
OptionalInt tileDataHash
OptionalLong tileDataHash
) {
@Override

Wyświetl plik

@ -12,6 +12,12 @@ public final class Hashing {
public static final int FNV1_32_INIT = 0x811c9dc5;
private static final int FNV1_PRIME_32 = 16777619;
/**
* Initial hash for the FNV-1 and FNV-1a 64-bit hash function.
*/
public static final long FNV1_64_INIT = 0xcbf29ce484222325L;
private static final long FNV1_PRIME_64 = 1099511628211L;
private Hashing() {}
/**
@ -19,7 +25,7 @@ public final class Hashing {
* <p>
* The hash generation must always start with {@link #FNV1_32_INIT} as initial hash but this version comes in handy
* when generating the hash for multiple bytes consecutively in a loop.
*
*
* @param initHash the initial hash
* @param data the data to generate the hash for
* @return the generated hash
@ -35,7 +41,7 @@ public final class Hashing {
/**
* Computes the hash using the FNV-1a 32-bit hash function.
*
*
* @param data the data to generate the hash for
* @return the hash
*/
@ -43,4 +49,33 @@ public final class Hashing {
return fnv1a32(FNV1_32_INIT, data);
}
/**
 * Folds {@code data} into a running FNV-1a 64-bit hash and returns the updated value.
 * <p>
 * Seed the first call with {@link #FNV1_64_INIT}; feeding the result back in as {@code initHash} lets a caller hash
 * several chunks of bytes one after another, for example inside a loop.
 *
 * @param initHash the hash value to continue from
 * @param data     the bytes to mix into the hash, in order
 * @return {@code initHash} after every byte of {@code data} has been mixed in; {@code initHash} itself when
 *         {@code data} is empty
 */
public static long fnv1a64(long initHash, byte... data) {
  long state = initHash;
  for (int i = 0; i < data.length; i++) {
    // Each byte is treated as an unsigned value (0..255) before it is xor-ed into the state.
    int unsigned = Byte.toUnsignedInt(data[i]);
    state = (state ^ unsigned) * FNV1_PRIME_64;
  }
  return state;
}
/**
 * Hashes {@code data} with the FNV-1a 64-bit function, starting from {@link #FNV1_64_INIT}.
 *
 * @param data the bytes to hash, in order
 * @return the resulting 64-bit hash
 */
public static long fnv1a64(byte... data) {
  final long seed = FNV1_64_INIT;
  return fnv1a64(seed, data);
}
}

Wyświetl plik

@ -14,7 +14,7 @@ import java.sql.Statement;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
import java.util.OptionalLong;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
@ -55,7 +55,7 @@ class MbtilesTest {
(byte) (dataBase >> 16),
(byte) (dataBase >> 24)
});
writer.write(new TileEncodingResult(entry.tile(), entry.bytes(), OptionalInt.of(dataHash)));
writer.write(new TileEncodingResult(entry.tile(), entry.bytes(), OptionalLong.of(dataHash)));
expected.add(entry);
}
}

Wyświetl plik

@ -11,7 +11,7 @@ import com.onthegomap.planetiler.geo.TileCoord;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.OptionalInt;
import java.util.OptionalLong;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -53,7 +53,7 @@ class VerifyTest {
VectorTile.encodeGeometry(point(0, 0)),
Map.of()
)));
writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalInt.empty()));
writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalLong.empty()));
}
assertValid(mbtiles);
}
@ -77,7 +77,7 @@ class VerifyTest {
)),
Map.of()
)));
writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalInt.empty()));
writer.write(new TileEncodingResult(TileCoord.ofXYZ(0, 0, 0), gzip(tile.encode()), OptionalLong.empty()));
}
assertInvalid(mbtiles);
}

Wyświetl plik

@ -3,20 +3,40 @@ package com.onthegomap.planetiler.util;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import java.util.function.Function;
import org.junit.jupiter.api.Test;
/**
 * Tests for the FNV-1a hash helpers in {@link Hashing}.
 * <p>
 * Properties common to both hash widths are checked once in {@link #testHasher(Function, Object)}.
 */
class HashingTest {

  @Test
  void testFnv1a32() {
    testHasher(Hashing::fnv1a32, Hashing.FNV1_32_INIT);
    // An int literal cannot be passed as a byte vararg, so 123 acts as the initial hash here with no data bytes;
    // hashing nothing is expected to hand that seed back unchanged.
    assertEquals(123, Hashing.fnv1a32(123));
  }

  @Test
  void testFnv1a64() {
    testHasher(Hashing::fnv1a64, Hashing.FNV1_64_INIT);
    // Same as above for the 64-bit variant: 123 is the initial hash and there are no data bytes.
    assertEquals(123, Hashing.fnv1a64(123));
  }

  /**
   * Builds a byte array from int literals so callers can write {@code bytes(1, 200)} without casts.
   *
   * @param bytes values in the range 0..255
   * @return an array holding each value narrowed to a byte
   * @throws IllegalArgumentException if any value is outside 0..255
   */
  private static byte[] bytes(int... bytes) {
    byte[] result = new byte[bytes.length];
    for (int i = 0; i < bytes.length; i++) {
      int value = bytes[i];
      // Checked explicitly rather than with the assert keyword so the guard holds even when assertions are disabled.
      if (value < 0 || value > 0xff) {
        throw new IllegalArgumentException("byte value out of range [0, 255]: " + value);
      }
      result[i] = (byte) value;
    }
    return result;
  }

  /**
   * Checks the properties every hasher under test must have: repeated calls agree, different inputs give different
   * hashes for the sampled values, and empty input yields the initial hash.
   *
   * @param hashFn the hash function under test
   * @param init   the value expected for empty input
   * @param <T>    the hash type
   */
  private static <T> void testHasher(Function<byte[], T> hashFn, T init) {
    // same input, same hash
    assertEquals(hashFn.apply(bytes()), hashFn.apply(bytes()));
    assertEquals(hashFn.apply(bytes(1)), hashFn.apply(bytes(1)));
    assertEquals(hashFn.apply(bytes(1, 2)), hashFn.apply(bytes(1, 2)));
    // different content or different length, different hash
    assertNotEquals(hashFn.apply(bytes(1)), hashFn.apply(bytes(2)));
    assertNotEquals(hashFn.apply(bytes(1)), hashFn.apply(bytes(1, 1)));
    // no input leaves the initial hash untouched
    assertEquals(init, hashFn.apply(bytes()));
  }
}