Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion augmented_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,10 @@ def elem(self):
if nds:
bounds = Bounds()
for nd in nds:
bounds.add(float(nd.get("lon")), float(nd.get("lat")))
lon = nd.get("lon")
lat = nd.get("lat")
if lon is not None and lat is not None:
bounds.add(float(nd.get("lon")), float(nd.get("lat")))
osm_obj.insert(0, bounds.elem())

eprint(f"Pass 5: {time.time() - pass_5_start_time:.3f}s")
Expand Down
27 changes: 15 additions & 12 deletions gc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,20 @@
# so if a changeset hasn't been modified in at least that long, we can
# safely assume that it won't change again in the future.

find stage-data/changesets/ -type f -mtime +3 -printf '%P\n' \
| cut -d '.' -f1 \
| while read changeset_id; do
echo "removing files for changeset $changeset_id"
# atomically move the split files to a temporary location before deleting
# them (prevents merge_adiffs.py being run during the deletion, which could
# result in an incomplete adiff being generated)
tmpdir=$(mktemp -d)
mv stage-data/split-adiffs/$changeset_id/ $tmpdir
rm -rf $tmpdir
SPLIT_ADIFFS_DIR=${1:-'stage-data/split-adiffs'}
CHANGESET_DIR=${2:-'stage-data/changesets'}

# also delete the stamp file
rm stage-data/changesets/$changeset_id.adiff.md5

# Garbage-collect every changeset whose stamp file (<id>.adiff.md5) has not
# been modified for more than three days (-mtime +3): its split-adiff
# directory and the stamp file itself are removed.
#
# Reads: CHANGESET_DIR (stamp files), SPLIT_ADIFFS_DIR (per-changeset dirs).
# All expansions are quoted because both directories are caller-supplied and
# may contain whitespace or glob characters.
find "$CHANGESET_DIR" -type f -name "*.adiff.md5" -mtime +3 | while IFS= read -r stampfile; do
  changeset_id=$(basename "$stampfile" .adiff.md5)
  echo "removing files for changeset $changeset_id"
  # atomically move the split files to a temporary location before deleting
  # them (prevents merge_adiffs.py being run during the deletion, which could
  # result in an incomplete adiff being generated)
  # NOTE(review): the move is only a single rename when mktemp's directory is
  # on the same filesystem as $SPLIT_ADIFFS_DIR — confirm for your deployment.
  tmpdir=$(mktemp -d)
  mv -- "$SPLIT_ADIFFS_DIR/$changeset_id/" "$tmpdir"
  rm -rf -- "$tmpdir"

  # also delete the stamp file
  rm -- "$CHANGESET_DIR/$changeset_id.adiff.md5"
done
20 changes: 13 additions & 7 deletions merge.mk
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,23 @@
.ONESHELL:
.SECONDEXPANSION:

SPLIT_ADIFFS_DIR ?= stage-data/split-adiffs
CHANGESET_DIR ?= stage-data/changesets
BUCKET_DIR ?= bucket-data/replication/minute
API_URL ?= https://api.openstreetmap.org


MAKEDIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))

# all: metadatas changesets
all: changesets

changesets: $(shell find stage-data/split-adiffs/ -mindepth 1 -type d -printf 'stage-data/changesets/%P.adiff.md5\n')
changesets: $(shell find $(SPLIT_ADIFFS_DIR)/ -mindepth 1 -type d | sed 's|$(SPLIT_ADIFFS_DIR)|$(CHANGESET_DIR)/|g' | sed 's|$$|.adiff.md5|')

# bucket-data/changesets/%.adiff: $$(wildcard stage-data/split-adiffs/%/*)
stage-data/changesets/%.adiff.md5: $$(wildcard stage-data/split-adiffs/%/*)
$(CHANGESET_DIR)/%.adiff.md5: $$(wildcard $(SPLIT_ADIFFS_DIR)/%/*)
tmpfile=$$(mktemp)
merge_adiffs.py $^ | xmlstarlet format > $$tmpfile
python merge_adiffs.py $^ | xmlstarlet format > $$tmpfile
if [ -s $$tmpfile ]; then
# merge_adiffs.py can fail if it is given no input files or if one or more
# of its input files are not found. Either of these can happen if the input
Expand All @@ -27,10 +33,10 @@ stage-data/changesets/%.adiff.md5: $$(wildcard stage-data/split-adiffs/%/*)
# stamp file if the merged output file is nonempty (-s).
md5sum < $$tmpfile > $@
gzip -c < $$tmpfile > $$tmpfile.gz
mv $$tmpfile.gz bucket-data/changesets/$*.adiff && rm $$tmpfile
mv $$tmpfile.gz $(BUCKET_DIR)/$*.adiff && rm $$tmpfile
fi

metadatas: $(shell find stage-data/split-adiffs/ -mindepth 1 -type d -printf '%p/metadata.xml\n')
metadatas: $(shell find $(SPLIT_ADIFFS_DIR)/ -mindepth 1 -type d | sed 's|$$|/metadata.xml|')

stage-data/split-adiffs/%/metadata.xml:
curl -sL https://api.openstreetmap.org/api/0.6/changeset/$* -o $@
$(SPLIT_ADIFFS_DIR)/%/metadata.xml:
curl -sL "$(API_URL)/api/0.6/changeset/$*" -o $@
58 changes: 46 additions & 12 deletions process.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/sh

# Usage: process.sh
# Usage: process.sh [replication_dir] [split_dir] [changeset_dir] [bucket_dir] [api_url] [minutes_filter]
#
# Does the following:
# - splits all adiffs in stage-data/replication-adiffs/*.adiff into stage-data/split-adiffs/*/
Expand All @@ -11,19 +11,47 @@
# works (the scripts are installed into /mnt/osmcha/bin). In your own deployment you
# should ensure that the scripts (specifically split_adiff.py, merge_adiff.py, and
# merge.mk) are in your $PATH, because process.sh assumes they are.
export PATH=$PATH:/mnt/osmcha/bin

for adiff_file in $(find stage-data/replication-adiffs/ -type f); do
# export PATH=$PATH:/mnt/osmcha/bin # Is this really necessary? It seems not.

REPLICATION_ADIFFS_DIR=${1:-'stage-data/replication-adiffs'}
SPLIT_ADIFFS_DIR=${2:-'stage-data/split-adiffs'}
CHANGESET_DIR=${3:-'stage-data/changesets'}
BUCKET_DIR=${4:-'bucket-data/replication/minute'}
API_URL=${5:-'https://api.openstreetmap.org'}
FILTER_ADIFF_FILES=${6:-''}


# Build the list of replication adiff files to split. Without a minutes
# filter every regular file under the replication directory is taken;
# with one, only files modified within the last N minutes (find -mmin -N).
if [ -z "$FILTER_ADIFF_FILES" ]; then
  echo "Processing all .adiff files in $REPLICATION_ADIFFS_DIR..."
  adiff_files=$(find "$REPLICATION_ADIFFS_DIR" -type f)
else
  echo "Filtering .adiff files modified in the last $FILTER_ADIFF_FILES minutes..."
  adiff_files=$(find "$REPLICATION_ADIFFS_DIR" -type f -mmin "-$FILTER_ADIFF_FILES")
fi

for adiff_file in $adiff_files; do

seqno=$(basename -s .adiff $adiff_file)
tmpdir=$(mktemp -d)

# split the adiff file
split_adiff.py $adiff_file $tmpdir
python split_adiff.py $adiff_file $tmpdir

# Check whether any adiff files were generated
if [ -z "$(ls -A "$tmpdir"/*.adiff 2>/dev/null)" ]; then
echo "No .adiff files generated from $adiff_file — skipping"
rm -rf "$tmpdir"
continue
fi

for file in $tmpdir/*.adiff; do
changeset=$(basename -s .adiff $file)
mkdir -p stage-data/split-adiffs/$changeset/
mv $file stage-data/split-adiffs/$changeset/$seqno.adiff
echo "Changeset $changeset has $seqno adiffs"
# move the adiff file into place
mkdir -p $SPLIT_ADIFFS_DIR/$changeset/
mv $file $SPLIT_ADIFFS_DIR/$changeset/$seqno.adiff
done

rm -rf $tmpdir
Expand All @@ -39,10 +67,16 @@ for adiff_file in $(find stage-data/replication-adiffs/ -type f); do
rm $adiff_file
done

# merge all our split files, potentially updating existing changesets.
# this is done using a makefile script in order to avoid needlessly reprocessing
# changesets whose set of input (split-adiffs/) files haven't changed.
merge.mk
# # merge all our split files, potentially updating existing changesets.
# # this is done using a makefile script in order to avoid needlessly reprocessing
# # changesets whose set of input (split-adiffs/) files haven't changed.
# merge.mk
make -f merge.mk \
REPLICATION_ADIFFS_DIR="${REPLICATION_ADIFFS_DIR}" \
SPLIT_ADIFFS_DIR="${SPLIT_ADIFFS_DIR}" \
CHANGESET_DIR="${CHANGESET_DIR}" \
BUCKET_DIR="${BUCKET_DIR}" \
API_URL="${API_URL}"

# clean up old stage-data that we don't need anymore
gc.sh
# # clean up old stage-data that we don't need anymore
./gc.sh "${SPLIT_ADIFFS_DIR}" "${CHANGESET_DIR}"
23 changes: 17 additions & 6 deletions update.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,32 @@
# should run this script in a cron job once per minute. It is good practice
# to use flock to ensure that only one instance of the job is running at a time.

set -ex
export PATH=$PATH:/home/osmx/osmx-adiff-builder
set -e
export PATH=$PATH:/${PWD}

## Initial sequence number — only required on the very first run, right after the osmx database is created
OSMX_DB_PATH=$1
REPLICATION_ADIFFS_DIR=$2
INITIAL_SEQNUM=$3

eval "$(mise activate bash --shims)"

osm replication minute --seqno $(osmx query $1 seqnum) \
# Pick the sequence number to resume replication from: an explicitly
# supplied INITIAL_SEQNUM wins; when it is empty, ask the osmx database
# for its current seqnum instead.
seqno_start="$INITIAL_SEQNUM"
if [ -z "$seqno_start" ]; then
  seqno_start="$(osmx query "$OSMX_DB_PATH" seqnum)"
fi

osm replication minute --seqno $seqno_start \
| while read seqno timestamp url; do
test -z "$seqno" && continue # skip blank lines or empty output

curl -sL $url | gzip -d > $seqno.osc
tmpfile=$(mktemp)

augmented_diff.py $1 $seqno.osc | xmlstarlet format > $tmpfile
mv $tmpfile $2/$seqno.adiff
augmented_diff.py $OSMX_DB_PATH $seqno.osc | xmlstarlet format > $tmpfile
mv $tmpfile $REPLICATION_ADIFFS_DIR/$seqno.adiff

osmx update $1 $seqno.osc $seqno $timestamp --commit
osmx update $OSMX_DB_PATH $seqno.osc $seqno $timestamp --commit
rm $seqno.osc
done