From b51d64538f163246c3f2bc10709958310ac0a667 Mon Sep 17 00:00:00 2001 From: Rub21 Date: Wed, 6 Aug 2025 12:34:16 -0500 Subject: [PATCH 1/5] Use $PWD for loading the local folder --- update.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 update.sh diff --git a/update.sh b/update.sh old mode 100644 new mode 100755 index e205c46..4f4fc54 --- a/update.sh +++ b/update.sh @@ -12,7 +12,7 @@ # to use flock to ensure that only one instance of the job is running at a time. set -ex -export PATH=$PATH:/home/osmx/osmx-adiff-builder +export PATH=$PATH:/${PWD} eval "$(mise activate bash --shims)" From bcc2f4c0d8d3152565138190e10742757d42a80e Mon Sep 17 00:00:00 2001 From: Rub21 Date: Wed, 6 Aug 2025 12:44:20 -0500 Subject: [PATCH 2/5] Add INITIAL_SEQNUM to create diff files and update osmx file --- update.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/update.sh b/update.sh index 4f4fc54..14bcb23 100755 --- a/update.sh +++ b/update.sh @@ -16,7 +16,13 @@ export PATH=$PATH:/${PWD} eval "$(mise activate bash --shims)" -osm replication minute --seqno $(osmx query $1 seqnum) \ +if [ -n "$INITIAL_SEQNUM" ]; then + seqno_start="$INITIAL_SEQNUM" +else + seqno_start="$(osmx query "$1" seqnum)" +fi + +osm replication minute --seqno $seqno_start \ | while read seqno timestamp url; do test -z "$seqno" && continue # skip blank lines or empty output From bd457a5e5904ae7590c7be363c43ce440bfef2d3 Mon Sep 17 00:00:00 2001 From: Rub21 Date: Wed, 6 Aug 2025 16:18:18 -0500 Subject: [PATCH 3/5] Update path for different dirs --- gc.sh | 26 ++++++++++++++------------ merge.mk | 23 ++++++++++++++++------- process.sh | 43 +++++++++++++++++++++++++++++++------------ update.sh | 17 +++++++++++++---- 4 files changed, 74 insertions(+), 35 deletions(-) diff --git a/gc.sh b/gc.sh index a9f0a4f..44c358b 100755 --- a/gc.sh +++ b/gc.sh @@ -6,18 +6,20 @@ # The openstreetmap.org server automatically closes changesets after 24h, # so if a changeset hasn't been modified in at least that long, we can # safely assume that it won't change again in the future. +WORKDIR=/data +CHANGESET_DIR=$WORKDIR/stage-data/changesets +SPLIT_ADIFFS=$WORKDIR/stage-data/split-adiffs -find stage-data/changesets/ -type f -mtime +3 -printf '%P\n' \ - | cut -d '.' -f1 \ - | while read changeset_id; do - echo "removing files for changeset $changeset_id" - # atomically move the split files to a temporary location before deleting - # them (prevents merge_adiffs.py being run during the deletion, which could - # result in an incomplete adiff being generated) - tmpdir=$(mktemp -d) - mv stage-data/split-adiffs/$changeset_id/ $tmpdir - rm -rf $tmpdir +find "$CHANGESET_DIR" -type f -name "*.adiff.md5" -mtime +3 | while read stampfile; do + changeset_id=$(basename "$stampfile" .adiff.md5) + echo "removing files for changeset $changeset_id" + # atomically move the split files to a temporary location before deleting + # them (prevents merge_adiffs.py being run during the deletion, which could + # result in an incomplete adiff being generated) + tmpdir=$(mktemp -d) + mv $SPLIT_ADIFFS/$changeset_id/ $tmpdir + rm -rf $tmpdir - # also delete the stamp file - rm stage-data/changesets/$changeset_id.adiff.md5 + # also delete the stamp file + rm $CHANGESET_DIR//$changeset_id.adiff.md5 done diff --git a/merge.mk b/merge.mk index fa55e4a..c3bc23e 100755 --- a/merge.mk +++ b/merge.mk @@ -8,17 +8,26 @@ .ONESHELL: .SECONDEXPANSION: + +WORKDIR := /data +API_URL ?= https://api.openstreetmap.org + +SPLIT_DIR := $(WORKDIR)/stage-data/split-adiffs +CHANGESET_DIR := $(WORKDIR)/stage-data/changesets +BUCKET_DIR := $(WORKDIR)/bucket-data/changesets + + MAKEDIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) # all: metadatas changesets all: changesets -changesets: $(shell find stage-data/split-adiffs/ -mindepth 1 -type d -printf 'stage-data/changesets/%P.adiff.md5\n') +changesets: $(shell find $(SPLIT_DIR) -mindepth 1 -type d | sed 's|$(SPLIT_DIR)|$(CHANGESET_DIR)|g' | sed 's|$$|.adiff.md5|') # bucket-data/changesets/%.adiff: $$(wildcard stage-data/split-adiffs/%/*) -stage-data/changesets/%.adiff.md5: $$(wildcard stage-data/split-adiffs/%/*) +$(CHANGESET_DIR)/%.adiff.md5: $$(wildcard $(SPLIT_DIR)/%/*) tmpfile=$$(mktemp) - merge_adiffs.py $^ | xmlstarlet format > $$tmpfile + python merge_adiffs.py $^ | xmlstarlet format > $$tmpfile if [ -s $$tmpfile ]; then # merge_adiffs.py can fail if it is given no input files or if one or more # of its input files are not found. Either of these can happen if the input @@ -27,10 +36,10 @@ stage-data/changesets/%.adiff.md5: $$(wildcard stage-data/split-adiffs/%/*) # stamp file if the merged output file is nonempty (-s). md5sum < $$tmpfile > $@ gzip -c < $$tmpfile > $$tmpfile.gz - mv $$tmpfile.gz bucket-data/changesets/$*.adiff && rm $$tmpfile + mv $$tmpfile.gz $(BUCKET_DIR)/$*.adiff && rm $$tmpfile fi -metadatas: $(shell find stage-data/split-adiffs/ -mindepth 1 -type d -printf '%p/metadata.xml\n') +metadatas: $(shell find $(SPLIT_DIR)/ -mindepth 1 -type d | sed 's|$$|/metadata.xml|') -stage-data/split-adiffs/%/metadata.xml: - curl -sL https://api.openstreetmap.org/api/0.6/changeset/$* -o $@ +$(SPLIT_DIR)/%/metadata.xml: + curl -sL "$(API_URL)/api/0.6/changeset/$*" -o $@ diff --git a/process.sh b/process.sh index 7e0db1b..77babaf 100755 --- a/process.sh +++ b/process.sh @@ -11,19 +11,38 @@ # works (the scripts are installed into /mnt/osmcha/bin). In your own deployment you # should ensure that the scripts (specifically split_adiff.py, merge_adiff.py, and # merge.mk) are in your $PATH, because process.sh assumes they are. -export PATH=$PATH:/mnt/osmcha/bin -for adiff_file in $(find stage-data/replication-adiffs/ -type f); do +# export PATH=$PATH:/mnt/osmcha/bin # Is this really necesary, seems not +WORKDIR=/data +REPLICATION_ADIFFS=$WORKDIR/stage-data/replication-adiffs # make sure this same as in ./update.sh +SPLIT_ADIFFS=$WORKDIR/stage-data/split-adiffs +CHANGESET_DIR=$WORKDIR/stage-data/changesets +BUCKET_DIR=$WORKDIR/bucket-data/changesets +API_URL=${API_URL:-https://api.openstreetmap.org} + + +mkdir -p $SPLIT_ADIFFS $CHANGESET_DIR $BUCKET_DIR + +for adiff_file in $(find $REPLICATION_ADIFFS/ -type f); do seqno=$(basename -s .adiff $adiff_file) tmpdir=$(mktemp -d) # split the adiff file - split_adiff.py $adiff_file $tmpdir + python split_adiff.py $adiff_file $tmpdir + + # Check if adiff files are been generated + if [ -z "$(ls -A "$tmpdir"/*.adiff 2>/dev/null)" ]; then + echo "No .adiff files generated from $adiff_file — skipping" + rm -rf "$tmpdir" + continue + fi for file in $tmpdir/*.adiff; do changeset=$(basename -s .adiff $file) - mkdir -p stage-data/split-adiffs/$changeset/ - mv $file stage-data/split-adiffs/$changeset/$seqno.adiff + echo "Changeset $changeset has $seqno adiffs" + # move the adiff file into place + mkdir -p $$SPLIT_ADIFFS/$changeset/ + mv $file $$SPLIT_ADIFFS/$changeset/$seqno.adiff done rm -rf $tmpdir @@ -39,10 +58,10 @@ for adiff_file in $(find stage-data/replication-adiffs/ -type f); do rm $adiff_file done -# merge all our split files, potentially updating existing changesets. -# this is done using a makefile script in order to avoid needlessly reprocessing -# changesets whose set of input (split-adiffs/) files haven't changed. -merge.mk - -# clean up old stage-data that we don't need anymore -gc.sh +# # merge all our split files, potentially updating existing changesets. +# # this is done using a makefile script in order to avoid needlessly reprocessing +# # changesets whose set of input (split-adiffs/) files haven't changed. +# merge.mk +make -f merge.mk API_URL="$API_URL" +# # clean up old stage-data that we don't need anymore +./gc.sh diff --git a/update.sh b/update.sh index 14bcb23..3335fd3 100755 --- a/update.sh +++ b/update.sh @@ -14,12 +14,21 @@ set -ex export PATH=$PATH:/${PWD} +WORKDIR=/data +DEFAULT_OSMX_DB_PATH=$WORKDIR/db/osmx.db +DEFAULT_REPLICATION_ADIFFS=$WORKDIR/stage-data/replication-adiffs + +## Use $1 or s2 if passed, else default +OSMX_DB_PATH="${1:-$DEFAULT_OSMX_DB_PATH}" +REPLICATION_ADIFFS="${2:-$DEFAULT_REPLICATION_ADIFFS}" +mkdir -p $REPLICATION_ADIFFS + eval "$(mise activate bash --shims)" if [ -n "$INITIAL_SEQNUM" ]; then seqno_start="$INITIAL_SEQNUM" else - seqno_start="$(osmx query "$1" seqnum)" + seqno_start="$(osmx query "$OSMX_DB_PATH" seqnum)" fi osm replication minute --seqno $seqno_start \ @@ -29,9 +38,9 @@ osm replication minute --seqno $seqno_start \ curl -sL $url | gzip -d > $seqno.osc tmpfile=$(mktemp) - augmented_diff.py $1 $seqno.osc | xmlstarlet format > $tmpfile - mv $tmpfile $2/$seqno.adiff + augmented_diff.py $OSMX_DB_PATH $seqno.osc | xmlstarlet format > $tmpfile + mv $tmpfile $REPLICATION_ADIFFS/$seqno.adiff - osmx update $1 $seqno.osc $seqno $timestamp --commit + osmx update $OSMX_DB_PATH $seqno.osc $seqno $timestamp --commit rm $seqno.osc done From 6a67d544187c929ac99f7b14d33c9a5cd09d74fe Mon Sep 17 00:00:00 2001 From: Rub21 Date: Thu, 7 Aug 2025 13:13:43 -0500 Subject: [PATCH 4/5] Update scripts to pass custom arguments --- gc.sh | 11 ++++++----- merge.mk | 17 +++++++---------- process.sh | 41 ++++++++++++++++++++++++++++------------- update.sh | 16 ++++++---------- 4 files changed, 47 insertions(+), 38 deletions(-) diff --git a/gc.sh b/gc.sh index 44c358b..7174465 100755 --- a/gc.sh +++ b/gc.sh @@ -6,9 +6,10 @@ # The openstreetmap.org server automatically closes changesets after 24h, # so if a changeset hasn't been modified in at least that long, we can # safely assume that it won't change again in the future. -WORKDIR=/data -CHANGESET_DIR=$WORKDIR/stage-data/changesets -SPLIT_ADIFFS=$WORKDIR/stage-data/split-adiffs + +SPLIT_ADIFFS_DIR=${1:-'stage-data/split-adiffs'} +CHANGESET_DIR=${2:-'stage-data/changesets'} + find "$CHANGESET_DIR" -type f -name "*.adiff.md5" -mtime +3 | while read stampfile; do changeset_id=$(basename "$stampfile" .adiff.md5) @@ -17,9 +18,9 @@ find "$CHANGESET_DIR" -type f -name "*.adiff.md5" -mtime +3 | while read stampfi # them (prevents merge_adiffs.py being run during the deletion, which could # result in an incomplete adiff being generated) tmpdir=$(mktemp -d) - mv $SPLIT_ADIFFS/$changeset_id/ $tmpdir + mv $SPLIT_ADIFFS_DIR/$changeset_id/ $tmpdir rm -rf $tmpdir # also delete the stamp file - rm $CHANGESET_DIR//$changeset_id.adiff.md5 + rm $CHANGESET_DIR/$changeset_id.adiff.md5 done diff --git a/merge.mk b/merge.mk index c3bc23e..a958a6a 100755 --- a/merge.mk +++ b/merge.mk @@ -8,24 +8,21 @@ .ONESHELL: .SECONDEXPANSION: - -WORKDIR := /data +SPLIT_ADIFFS_DIR ?= stage-data/split-adiffs +CHANGESET_DIR ?= stage-data/changesets +BUCKET_DIR ?= bucket-data/replication/minute API_URL ?= https://api.openstreetmap.org -SPLIT_DIR := $(WORKDIR)/stage-data/split-adiffs -CHANGESET_DIR := $(WORKDIR)/stage-data/changesets -BUCKET_DIR := $(WORKDIR)/bucket-data/changesets - MAKEDIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST)))) # all: metadatas changesets all: changesets -changesets: $(shell find $(SPLIT_DIR) -mindepth 1 -type d | sed 's|$(SPLIT_DIR)|$(CHANGESET_DIR)|g' | sed 's|$$|.adiff.md5|') +changesets: $(shell find $(SPLIT_ADIFFS_DIR)/ -mindepth 1 -type d | sed 's|$(SPLIT_ADIFFS_DIR)|$(CHANGESET_DIR)/|g' | sed 's|$$|.adiff.md5|') # bucket-data/changesets/%.adiff: $$(wildcard stage-data/split-adiffs/%/*) -$(CHANGESET_DIR)/%.adiff.md5: $$(wildcard $(SPLIT_DIR)/%/*) +$(CHANGESET_DIR)/%.adiff.md5: $$(wildcard $(SPLIT_ADIFFS_DIR)/%/*) tmpfile=$$(mktemp) python merge_adiffs.py $^ | xmlstarlet format > $$tmpfile if [ -s $$tmpfile ]; then @@ -39,7 +36,7 @@ $(CHANGESET_DIR)/%.adiff.md5: $$(wildcard $(SPLIT_DIR)/%/*) mv $$tmpfile.gz $(BUCKET_DIR)/$*.adiff && rm $$tmpfile fi -metadatas: $(shell find $(SPLIT_DIR)/ -mindepth 1 -type d | sed 's|$$|/metadata.xml|') +metadatas: $(shell find $(SPLIT_ADIFFS_DIR)/ -mindepth 1 -type d | sed 's|$$|/metadata.xml|') -$(SPLIT_DIR)/%/metadata.xml: +$(SPLIT_ADIFFS_DIR)/%/metadata.xml: curl -sL "$(API_URL)/api/0.6/changeset/$*" -o $@ diff --git a/process.sh b/process.sh index 77babaf..671cb81 100755 --- a/process.sh +++ b/process.sh @@ -1,6 +1,6 @@ #!/bin/sh -# Usage: process.sh +# Usage: process.sh [replication_dir] [split_dir] [changeset_dir] [bucket_dir] [api_url] [minutes_filter] # # Does the following: # - splits all adiffs in stage-data/replication-adiffs/*.adiff into stage-data/split-adiffs/*/ @@ -13,17 +13,26 @@ # merge.mk) are in your $PATH, because process.sh assumes they are. # export PATH=$PATH:/mnt/osmcha/bin # Is this really necesary, seems not -WORKDIR=/data -REPLICATION_ADIFFS=$WORKDIR/stage-data/replication-adiffs # make sure this same as in ./update.sh -SPLIT_ADIFFS=$WORKDIR/stage-data/split-adiffs -CHANGESET_DIR=$WORKDIR/stage-data/changesets -BUCKET_DIR=$WORKDIR/bucket-data/changesets -API_URL=${API_URL:-https://api.openstreetmap.org} +REPLICATION_ADIFFS_DIR=${1:-'stage-data/replication-adiffs'} +SPLIT_ADIFFS_DIR=${2:-'stage-data/split-adiffs'} +CHANGESET_DIR=${3:-'stage-data/changesets'} +BUCKET_DIR=${4:-'bucket-data/replication/minute'} +API_URL=${5:-'https://api.openstreetmap.org'} +FILTER_ADIFF_FILES=${6:-''} -mkdir -p $SPLIT_ADIFFS $CHANGESET_DIR $BUCKET_DIR -for adiff_file in $(find $REPLICATION_ADIFFS/ -type f); do +# Determine which .adiff files to process +if [ -n "$FILTER_ADIFF_FILES" ]; then + echo "Filtering .adiff files modified in the last $FILTER_ADIFF_FILES minutes..." + adiff_files=$(find "$REPLICATION_ADIFFS_DIR" -type f -mmin -"$FILTER_ADIFF_FILES") +else + echo "Processing all .adiff files in $REPLICATION_ADIFFS_DIR..." + adiff_files=$(find "$REPLICATION_ADIFFS_DIR" -type f) +fi + +for adiff_file in $adiff_files; do + seqno=$(basename -s .adiff $adiff_file) tmpdir=$(mktemp -d) @@ -41,8 +50,8 @@ for adiff_file in $(find $REPLICATION_ADIFFS/ -type f); do changeset=$(basename -s .adiff $file) echo "Changeset $changeset has $seqno adiffs" # move the adiff file into place - mkdir -p $$SPLIT_ADIFFS/$changeset/ - mv $file $$SPLIT_ADIFFS/$changeset/$seqno.adiff + mkdir -p $SPLIT_ADIFFS_DIR/$changeset/ + mv $file $SPLIT_ADIFFS_DIR/$changeset/$seqno.adiff done rm -rf $tmpdir @@ -62,6 +71,12 @@ done # # this is done using a makefile script in order to avoid needlessly reprocessing # # changesets whose set of input (split-adiffs/) files haven't changed. # merge.mk -make -f merge.mk API_URL="$API_URL" +make -f merge.mk \ + REPLICATION_ADIFFS_DIR="${REPLICATION_ADIFFS_DIR}" \ + SPLIT_ADIFFS_DIR="${SPLIT_ADIFFS_DIR}" \ + CHANGESET_DIR="${CHANGESET_DIR}" \ + BUCKET_DIR="${BUCKET_DIR}" \ + API_URL="${API_URL}" + # # clean up old stage-data that we don't need anymore -./gc.sh +./gc.sh "${SPLIT_ADIFFS_DIR}" "${CHANGESET_DIR}" diff --git a/update.sh b/update.sh index 3335fd3..56bb969 100755 --- a/update.sh +++ b/update.sh @@ -11,17 +11,13 @@ # should run this script in a cron job once per minute. It is good practice # to use flock to ensure that only one instance of the job is running at a time. -set -ex +set -e export PATH=$PATH:/${PWD} -WORKDIR=/data -DEFAULT_OSMX_DB_PATH=$WORKDIR/db/osmx.db -DEFAULT_REPLICATION_ADIFFS=$WORKDIR/stage-data/replication-adiffs - -## Use $1 or s2 if passed, else default -OSMX_DB_PATH="${1:-$DEFAULT_OSMX_DB_PATH}" -REPLICATION_ADIFFS="${2:-$DEFAULT_REPLICATION_ADIFFS}" -mkdir -p $REPLICATION_ADIFFS +## Initial sequence number — this is only required at the beginning. right after the osmx database is created +OSMX_DB_PATH=$1 +REPLICATION_ADIFFS_DIR=$2 +INITIAL_SEQNUM=$3 eval "$(mise activate bash --shims)" @@ -39,7 +35,7 @@ osm replication minute --seqno $seqno_start \ tmpfile=$(mktemp) augmented_diff.py $OSMX_DB_PATH $seqno.osc | xmlstarlet format > $tmpfile - mv $tmpfile $REPLICATION_ADIFFS/$seqno.adiff + mv $tmpfile $REPLICATION_ADIFFS_DIR/$seqno.adiff osmx update $OSMX_DB_PATH $seqno.osc $seqno $timestamp --commit rm $seqno.osc From 4e04bdfb247b6184fae1f003b798ca02192f9ef1 Mon Sep 17 00:00:00 2001 From: Rub21 Date: Thu, 7 Aug 2025 20:17:14 -0500 Subject: [PATCH 5/5] Fix issue with lon and lat when those are None --- augmented_diff.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/augmented_diff.py b/augmented_diff.py index 5560d84..71c4d21 100755 --- a/augmented_diff.py +++ b/augmented_diff.py @@ -498,7 +498,10 @@ def elem(self): if nds: bounds = Bounds() for nd in nds: - bounds.add(float(nd.get("lon")), float(nd.get("lat"))) + lon = nd.get("lon") + lat = nd.get("lat") + if lon is not None and lat is not None: + bounds.add(float(nd.get("lon")), float(nd.get("lat"))) osm_obj.insert(0, bounds.elem()) eprint(f"Pass 5: {time.time() - pass_5_start_time:.3f}s")