Lazy primitive group deserialization (#309)

pull/311/head
Michael Barry 2022-07-28 07:56:41 -04:00 zatwierdzone przez GitHub
rodzic 7109450265
commit 8a8db0005d
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
14 zmienionych plików z 22630 dodań i 28 usunięć

Wyświetl plik

@ -20,7 +20,7 @@ The `planetiler-core` module includes the following software:
- org.xerial:sqlite-jdbc (Apache license)
- com.ibm.icu:icu4j ([ICU license](https://github.com/unicode-org/icu/blob/main/icu4c/LICENSE))
- com.google.guava:guava (Apache license)
- org.openstreetmap.osmosis:osmosis-osm-binary (LGPL 3.0)
- com.google.protobuf:protobuf-java (BSD 3-Clause License)
- com.carrotsearch:hppc (Apache license)
- com.github.jnr:jnr-ffi (Apache license)
- org.roaringbitmap:RoaringBitmap (Apache license)
@ -37,6 +37,8 @@ The `planetiler-core` module includes the following software:
- `ArrayLongMinHeap` implementations from [graphhopper](https://github.com/graphhopper/graphhopper) (Apache license)
- `Hilbert` implementation
from [github.com/rawrunprotected/hilbert_curves](https://github.com/rawrunprotected/hilbert_curves) (Public Domain)
- `osmformat.proto` and `fileformat.proto` (generates `Osmformat.java` and `Fileformat.java`)
from [openstreetmap/OSM-binary](https://github.com/openstreetmap/OSM-binary/tree/master/osmpbf) (MIT License)
Additionally, the `planetiler-basemap` module is based on [OpenMapTiles](https://github.com/openmaptiles/openmaptiles):

Wyświetl plik

@ -33,9 +33,9 @@
<version>0.9.30</version>
</dependency>
<dependency>
<groupId>org.openstreetmap.osmosis</groupId>
<artifactId>osmosis-osm-binary</artifactId>
<version>0.48.3</version>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>3.21.3</version>
</dependency>
<dependency>
<groupId>com.github.jnr</groupId>

Wyświetl plik

@ -5,6 +5,7 @@ import com.onthegomap.planetiler.reader.FileFormatException;
import com.onthegomap.planetiler.util.ByteBufferUtil;
import com.onthegomap.planetiler.util.DiskBacked;
import com.onthegomap.planetiler.util.FileUtils;
import crosby.binary.Fileformat.BlobHeader;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
@ -15,7 +16,6 @@ import java.util.List;
import java.util.function.Consumer;
import java.util.function.Supplier;
import org.locationtech.jts.geom.Envelope;
import org.openstreetmap.osmosis.osmbinary.Fileformat.BlobHeader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Wyświetl plik

@ -5,7 +5,6 @@ import static com.onthegomap.planetiler.worker.Worker.joinFutures;
import com.carrotsearch.hppc.IntObjectHashMap;
import com.carrotsearch.hppc.LongArrayList;
import com.carrotsearch.hppc.LongHashSet;
import com.carrotsearch.hppc.LongObjectHashMap;
import com.carrotsearch.hppc.ObjectIntHashMap;
import com.onthegomap.planetiler.FeatureCollector;
@ -53,6 +52,7 @@ import org.locationtech.jts.geom.Point;
import org.locationtech.jts.geom.Polygon;
import org.locationtech.jts.geom.impl.CoordinateArraySequence;
import org.locationtech.jts.geom.impl.PackedCoordinateSequence;
import org.roaringbitmap.longlong.Roaring64Bitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -86,7 +86,7 @@ public class OsmReader implements Closeable, MemoryEstimator.HasEstimate {
private final Object wayToRelationsLock = new Object();
// for multipolygons need to store way info (20m ways, 800m nodes) to use when processing relations (4.5m)
// ~300mb
private LongHashSet waysInMultipolygon = new LongHashSet();
private Roaring64Bitmap waysInMultipolygon = new Roaring64Bitmap();
private final Object waysInMultipolygonLock = new Object();
// ~7GB
private LongLongMultimap.Replaceable multipolygonWayGeometries;
@ -541,7 +541,7 @@ public class OsmReader implements Closeable, MemoryEstimator.HasEstimate {
@Override
public long estimateMemoryUsageBytes() {
long size = 0;
size += estimateSize(waysInMultipolygon);
size += waysInMultipolygon.serializedSizeInBytes();
// multipolygonWayGeometries is reported separately
size += estimateSize(wayToRelations);
size += estimateSize(relationInfo);

Wyświetl plik

@ -4,7 +4,10 @@ package com.onthegomap.planetiler.reader.osm;
import com.carrotsearch.hppc.LongArrayList;
import com.google.common.collect.Iterators;
import com.google.protobuf.InvalidProtocolBufferException;
import com.onthegomap.planetiler.reader.FileFormatException;
import crosby.binary.Fileformat;
import crosby.binary.Osmformat;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
@ -20,8 +23,6 @@ import java.util.function.IntUnaryOperator;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
import org.locationtech.jts.geom.Envelope;
import org.openstreetmap.osmosis.osmbinary.Fileformat;
import org.openstreetmap.osmosis.osmbinary.Osmformat;
/**
* Converts PBF block data into decoded entities. This class was adapted from Osmosis to expose an iterator over blocks
@ -127,12 +128,7 @@ public class PbfDecoder implements Iterable<OsmElement> {
@Override
public Iterator<OsmElement> iterator() {
return Iterators.concat(block.getPrimitivegroupList().stream().map(primitiveGroup -> Iterators.concat(
new DenseNodeIterator(primitiveGroup.getDense()),
new NodeIterator(primitiveGroup.getNodesList()),
new WayIterator(primitiveGroup.getWaysList()),
new RelationIterator(primitiveGroup.getRelationsList())
)).iterator());
return Iterators.concat(new PrimitiveGroupIterator());
}
private Map<String, Object> buildTags(int num, IntUnaryOperator key, IntUnaryOperator value) {
@ -148,6 +144,43 @@ public class PbfDecoder implements Iterable<OsmElement> {
return Collections.emptyMap();
}
/**
 * Converts protobuf element metadata into an {@link OsmElement.Info}.
 *
 * @param info the protobuf metadata message, may be {@code null}
 * @return the converted metadata with the user string ID resolved through {@code fieldDecoder}, or {@code null} when
 *         {@code info} is {@code null}
 */
private OsmElement.Info parseInfo(Osmformat.Info info) {
  if (info == null) {
    return null;
  }
  // read the fields in the same order the OsmElement.Info constructor expects them
  var changeset = info.getChangeset();
  var timestamp = info.getTimestamp();
  var uid = info.getUid();
  var version = info.getVersion();
  var user = fieldDecoder.decodeString(info.getUserSid());
  return new OsmElement.Info(changeset, timestamp, uid, version, user);
}
/**
 * Walks the serialized primitive groups of {@code block} and parses each one only when {@link #next()} is called,
 * returning an iterator over the dense nodes, nodes, ways and relations (in that order) of that group.
 */
private class PrimitiveGroupIterator implements Iterator<Iterator<OsmElement>> {

  /** Index of the next primitive group to parse. */
  private int nextGroup = 0;

  @Override
  public boolean hasNext() {
    return nextGroup < block.getPrimitivegroupCount();
  }

  @Override
  public Iterator<OsmElement> next() {
    if (nextGroup >= block.getPrimitivegroupCount()) {
      throw new NoSuchElementException();
    }
    // advance before parsing so a malformed group is not retried on the next call
    var serialized = block.getPrimitivegroup(nextGroup++);
    try {
      var group = Osmformat.PrimitiveGroup.parseFrom(serialized);
      var denseNodes = new DenseNodeIterator(group.getDense());
      var nodes = new NodeIterator(group.getNodesList());
      var ways = new WayIterator(group.getWaysList());
      var relations = new RelationIterator(group.getRelationsList());
      return Iterators.concat(denseNodes, nodes, ways, relations);
    } catch (InvalidProtocolBufferException e) {
      throw new FileFormatException("Unable to parse primitive group", e);
    }
  }
}
private class NodeIterator implements Iterator<OsmElement.Node> {
private final List<Osmformat.Node> nodes;
@ -273,16 +306,6 @@ public class PbfDecoder implements Iterable<OsmElement> {
}
}
private OsmElement.Info parseInfo(Osmformat.Info info) {
return info == null ? null : new OsmElement.Info(
info.getChangeset(),
info.getTimestamp(),
info.getUid(),
info.getVersion(),
fieldDecoder.decodeString(info.getUserSid())
);
}
private class DenseNodeIterator implements Iterator<OsmElement.Node> {
final Osmformat.DenseNodes nodes;

Wyświetl plik

@ -2,8 +2,8 @@
// See NOTICE.md here or copying.txt from https://github.com/openstreetmap/osmosis/blob/master/package/copying.txt for details.
package com.onthegomap.planetiler.reader.osm;
import crosby.binary.Osmformat;
import java.util.Date;
import org.openstreetmap.osmosis.osmbinary.Osmformat;
/**
* Manages decoding of the lower level PBF data structures.

Wyświetl plik

@ -0,0 +1,68 @@
/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
syntax = "proto2";
option java_package = "crosby.binary";
package OSMPBF;
//protoc --java_out=../.. fileformat.proto
//
// STORAGE LAYER: Storing primitives.
//
// A chunk of payload bytes. The `data` oneof holds at most one representation of
// the payload: either the uncompressed bytes (`raw`) or the bytes in one of the
// listed compression formats, in which case `raw_size` carries the uncompressed size.
message Blob {
optional int32 raw_size = 2; // When compressed, the uncompressed size
oneof data {
bytes raw = 1; // No compression
// Possible compressed versions of the data.
bytes zlib_data = 3;
// For LZMA compressed data (optional)
bytes lzma_data = 4;
// Formerly used for bzip2 compressed data. Deprecated in 2010.
bytes OBSOLETE_bzip2_data = 5 [deprecated = true]; // Don't reuse this tag number.
// For LZ4 compressed data (optional)
bytes lz4_data = 6;
// For ZSTD compressed data (optional)
bytes zstd_data = 7;
}
}
/* A file contains a sequence of fileblock headers, each prefixed by
their length in network byte order, followed by a data block
containing the actual data. Types starting with a "_" are reserved.
*/
message BlobHeader {
required string type = 1;
optional bytes indexdata = 2;
required int32 datasize = 3;
}

Wyświetl plik

@ -0,0 +1,272 @@
/** Copyright (c) 2010 Scott A. Crosby. <scott@sacrosby.com>
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
This file is copied from https://raw.githubusercontent.com/openstreetmap/OSM-binary/master/osmpbf/osmformat.proto
(MIT License) and modified to allow for lazy deserialization of primitive groups.
*/
syntax = "proto2";
option java_package = "crosby.binary";
package OSMPBF;
/* OSM Binary file format
This is the master schema file of the OSM binary file format. This
file is designed to support limited random-access and future
extendability.
A binary OSM file consists of a sequence of FileBlocks (please see
fileformat.proto). The first fileblock contains a serialized instance
of HeaderBlock, followed by a sequence of PrimitiveBlock blocks that
contain the primitives.
Each primitiveblock is designed to be independently parsable. It
contains a string table storing all strings in that block (keys and
values in tags, roles in relations, usernames, etc.) as well as
metadata containing the precision of coordinates or timestamps in that
block.
A primitiveblock contains a sequence of primitive groups, each
containing primitives of the same type (nodes, densenodes, ways,
relations). Coordinates are stored in signed 64-bit integers. Lat&lon
are measured in units of <granularity> nanodegrees. The default
granularity of 100 nanodegrees corresponds to about 1cm on the ground,
and a full lat or lon fits into 32 bits.
Converting an integer to a latitude or longitude uses the formula:
$OUT = IN * granularity / 10**9$. Many encoding schemes use delta
coding when representing nodes and relations.
*/
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
/* Contains the file header. */
message HeaderBlock {
optional HeaderBBox bbox = 1;
/* Additional tags to aid in parsing this dataset */
repeated string required_features = 4;
repeated string optional_features = 5;
optional string writingprogram = 16;
optional string source = 17; // From the bbox field.
/* Tags that allow continuing an Osmosis replication */
// Replication timestamp, expressed in seconds since the epoch,
// otherwise the same value as in the "timestamp=..." field
// in the state.txt file used by Osmosis.
optional int64 osmosis_replication_timestamp = 32;
// Replication sequence number (sequenceNumber in state.txt).
optional int64 osmosis_replication_sequence_number = 33;
// Replication base URL (from Osmosis' configuration.txt file).
optional string osmosis_replication_base_url = 34;
}
/** The bounding box field in the OSM header. BBOX, as used in the OSM
header. Units are always in nanodegrees -- they do not obey
granularity rules. */
message HeaderBBox {
required sint64 left = 1;
required sint64 right = 2;
required sint64 top = 3;
required sint64 bottom = 4;
}
///////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////
message PrimitiveBlock {
required StringTable stringtable = 1;
// [PLANETILER] modified from original:
// repeated PrimitiveGroup primitivegroup = 2;
// so that we can lazily deserialize each primitive group to reduce memory
// pressure when reading an input file
repeated bytes primitivegroup = 2;
// Granularity, units of nanodegrees, used to store coordinates in this block.
optional int32 granularity = 17 [default = 100];
// Offset value between the output coordinates and the granularity grid in units of nanodegrees.
optional int64 lat_offset = 19 [default = 0];
optional int64 lon_offset = 20 [default = 0];
// Granularity of dates, normally represented in units of milliseconds since the 1970 epoch.
optional int32 date_granularity = 18 [default = 1000];
}
// Group of OSMPrimitives. All primitives in a group must be the same type.
message PrimitiveGroup {
repeated Node nodes = 1;
optional DenseNodes dense = 2;
repeated Way ways = 3;
repeated Relation relations = 4;
repeated ChangeSet changesets = 5;
}
/** String table, contains the common strings in each block.
Note that we reserve index '0' as a delimiter, so the entry at that
index in the table is ALWAYS blank and unused.
*/
message StringTable {
repeated bytes s = 1;
}
/* Optional metadata that may be included into each primitive. */
message Info {
optional int32 version = 1 [default = -1];
optional int64 timestamp = 2;
optional int64 changeset = 3;
optional int32 uid = 4;
optional uint32 user_sid = 5; // String IDs
// The visible flag is used to store history information. It indicates that
// the current object version has been created by a delete operation on the
// OSM API.
// When a writer sets this flag, it MUST add a required_features tag with
// value "HistoricalInformation" to the HeaderBlock.
// If this flag is not available for some object it MUST be assumed to be
// true if the file has the required_features tag "HistoricalInformation"
// set.
optional bool visible = 6;
}
/** Optional metadata that may be included into each primitive. Special dense format used in DenseNodes. */
message DenseInfo {
repeated int32 version = 1 [packed = true];
repeated sint64 timestamp = 2 [packed = true]; // DELTA coded
repeated sint64 changeset = 3 [packed = true]; // DELTA coded
repeated sint32 uid = 4 [packed = true]; // DELTA coded
repeated sint32 user_sid = 5 [packed = true]; // String IDs for usernames. DELTA coded
// The visible flag is used to store history information. It indicates that
// the current object version has been created by a delete operation on the
// OSM API.
// When a writer sets this flag, it MUST add a required_features tag with
// value "HistoricalInformation" to the HeaderBlock.
// If this flag is not available for some object it MUST be assumed to be
// true if the file has the required_features tag "HistoricalInformation"
// set.
repeated bool visible = 6 [packed = true];
}
// This is kept for backwards compatibility but not used anywhere.
message ChangeSet {
required int64 id = 1;
}
// A single node in non-dense form: an id, a lat/lon coordinate, optional metadata,
// and tags stored as two parallel arrays of string IDs (keys[i] pairs with vals[i]).
message Node {
required sint64 id = 1;
// Parallel arrays.
repeated uint32 keys = 2 [packed = true]; // String IDs.
repeated uint32 vals = 3 [packed = true]; // String IDs.
optional Info info = 4; // May be omitted in omitmeta
// Coordinate values; see PrimitiveBlock.granularity and the lat/lon offsets there.
required sint64 lat = 8;
required sint64 lon = 9;
}
/* Used to densely represent a sequence of nodes that do not have any tags.
We represent these nodes columnwise as five columns: ID's, lats, and
lons, all delta coded. When metadata is not omitted,
We encode keys & vals for all nodes as a single array of integers
containing key-stringid and val-stringid, using a stringid of 0 as a
delimiter between nodes.
( (<keyid> <valid>)* '0' )*
*/
message DenseNodes {
repeated sint64 id = 1 [packed = true]; // DELTA coded
optional DenseInfo denseinfo = 5;
repeated sint64 lat = 8 [packed = true]; // DELTA coded
repeated sint64 lon = 9 [packed = true]; // DELTA coded
// Special packing of keys and vals into one array. May be empty if all nodes in this block are tagless.
repeated int32 keys_vals = 10 [packed = true];
}
// A way: an id, tags as parallel keys/vals arrays (presumably string IDs, as in
// Node -- confirm), optional metadata, and a delta-coded list of refs that may be
// accompanied by per-ref lat/lon values.
message Way {
required int64 id = 1;
// Parallel arrays.
repeated uint32 keys = 2 [packed = true];
repeated uint32 vals = 3 [packed = true];
optional Info info = 4;
repeated sint64 refs = 8 [packed = true]; // DELTA coded
// The following two fields are optional. They are only used in a special
// format where node locations are also added to the ways. This makes the
// files larger, but allows creating way geometries directly.
//
// If this is used, you MUST set the optional_features tag "LocationsOnWays"
// and the number of values in refs, lat, and lon MUST be the same.
repeated sint64 lat = 9 [packed = true]; // DELTA coded, optional
repeated sint64 lon = 10 [packed = true]; // DELTA coded, optional
}
// A relation: an id, tags as parallel keys/vals arrays, optional metadata, and a
// member list stored as three parallel arrays (roles_sid[i], memids[i], types[i]
// together describe member i).
message Relation {
// Kind of element a member entry refers to.
enum MemberType {
NODE = 0;
WAY = 1;
RELATION = 2;
}
required int64 id = 1;
// Parallel arrays.
repeated uint32 keys = 2 [packed = true];
repeated uint32 vals = 3 [packed = true];
optional Info info = 4;
// Parallel arrays
repeated int32 roles_sid = 8 [packed = true]; // This should have been defined as uint32 for consistency, but it is now too late to change it
repeated sint64 memids = 9 [packed = true]; // DELTA encoded
repeated MemberType types = 10 [packed = true];
}

Wyświetl plik

@ -26,7 +26,9 @@
<sonar.organization>onthegomap</sonar.organization>
<sonar.projectKey>onthegomap_planetiler</sonar.projectKey>
<sonar.moduleKey>${project.artifactId}</sonar.moduleKey>
<sonar.exclusions>planetiler-benchmarks/**/*, **/VectorTileProto.java, **/generated/*.java</sonar.exclusions>
<sonar.exclusions>planetiler-benchmarks/**/*, **/VectorTileProto.java, **/crosby/binary/*.java,
**/generated/*.java
</sonar.exclusions>
</properties>
<scm>

Wyświetl plik

@ -3,6 +3,8 @@ set -ex
# Generate Java sources from each .proto file into planetiler-core/src/main/java/
echo "Regenerating..."
protoc --java_out=annotate_code:planetiler-core/src/main/java/ planetiler-core/src/main/resources/vector_tile_proto.proto
protoc --java_out=annotate_code:planetiler-core/src/main/java/ planetiler-core/src/main/resources/fileformat.proto
protoc --java_out=annotate_code:planetiler-core/src/main/java/ planetiler-core/src/main/resources/osmformat.proto
# Then run the repository's formatting script
echo "Formatting..."
./scripts/format.sh