OSMCha · Rub21 · Aug 6, 2025 · Aug 6, 2025 · Aug 6, 2025 · Aug 7, 2025
diff --git a/augmented_diff.py b/augmented_diff.py
@@ -498,7 +498,10 @@ def elem(self):
         if nds:
             bounds = Bounds()
             for nd in nds:
-                bounds.add(float(nd.get("lon")), float(nd.get("lat")))
+                lon = nd.get("lon")
+                lat = nd.get("lat")
+                if lon is not None and lat is not None:  # skip nds that carry no coordinates
+                    bounds.add(float(lon), float(lat))
             osm_obj.insert(0, bounds.elem())
 
 eprint(f"Pass 5: {time.time() - pass_5_start_time:.3f}s")

diff --git a/gc.sh b/gc.sh
@@ -7,17 +7,20 @@
 # so if a changeset hasn't been modified in at least that long, we can
 # safely assume that it won't change again in the future.
 
-find stage-data/changesets/ -type f -mtime +3 -printf '%P\n' \
-  | cut -d '.' -f1 \
-  | while read changeset_id; do
-    echo "removing files for changeset $changeset_id"
-    # atomically move the split files to a temporary location before deleting
-    # them (prevents merge_adiffs.py being run during the deletion, which could
-    # result in an incomplete adiff being generated)
-    tmpdir=$(mktemp -d)
-    mv stage-data/split-adiffs/$changeset_id/ $tmpdir
-    rm -rf $tmpdir
+SPLIT_ADIFFS_DIR=${1:-'stage-data/split-adiffs'}
+CHANGESET_DIR=${2:-'stage-data/changesets'}
 
-    # also delete the stamp file
-    rm stage-data/changesets/$changeset_id.adiff.md5
+
+find "$CHANGESET_DIR" -type f -name "*.adiff.md5" -mtime +3 | while IFS= read -r stampfile; do
+  changeset_id=$(basename "$stampfile" .adiff.md5)
+  echo "removing files for changeset $changeset_id"
+  # atomically move the split files to a temporary location before deleting
+  # them (prevents merge_adiffs.py being run during the deletion, which could
+  # result in an incomplete adiff being generated)
+  tmpdir=$(mktemp -d)
+  mv "$SPLIT_ADIFFS_DIR/$changeset_id/" "$tmpdir"
+  rm -rf "$tmpdir"
+
+  # also delete the stamp file
+  rm "$CHANGESET_DIR/$changeset_id.adiff.md5"
 done
diff --git a/merge.mk b/merge.mk
@@ -8,17 +8,23 @@
 .ONESHELL:
 .SECONDEXPANSION:
 
+SPLIT_ADIFFS_DIR ?= stage-data/split-adiffs
+CHANGESET_DIR ?= stage-data/changesets
+BUCKET_DIR ?= bucket-data/replication/minute
+API_URL ?= https://api.openstreetmap.org
+
+
 MAKEDIR := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))
 
 # all: metadatas changesets
 all: changesets
 
-changesets: $(shell find stage-data/split-adiffs/ -mindepth 1 -type d -printf 'stage-data/changesets/%P.adiff.md5\n')
+changesets: $(shell find $(SPLIT_ADIFFS_DIR)/ -mindepth 1 -type d | sed 's|^$(SPLIT_ADIFFS_DIR)/*|$(CHANGESET_DIR)/|' | sed 's|$$|.adiff.md5|')
 
 # bucket-data/changesets/%.adiff: $$(wildcard stage-data/split-adiffs/%/*)
-stage-data/changesets/%.adiff.md5: $$(wildcard stage-data/split-adiffs/%/*)
+$(CHANGESET_DIR)/%.adiff.md5: $$(wildcard $(SPLIT_ADIFFS_DIR)/%/*)
 	tmpfile=$$(mktemp)
-	merge_adiffs.py $^ | xmlstarlet format > $$tmpfile
+	python merge_adiffs.py $^ | xmlstarlet format > $$tmpfile
 	if [ -s $$tmpfile ]; then
 		# merge_adiffs.py can fail if it is given no input files or if one or more
 		# of its input files are not found. Either of these can happen if the input
@@ -27,10 +33,10 @@ stage-data/changesets/%.adiff.md5: $$(wildcard stage-data/split-adiffs/%/*)
 		# stamp file if the merged output file is nonempty (-s).
 		md5sum < $$tmpfile > $@
 		gzip -c < $$tmpfile > $$tmpfile.gz
-		mv $$tmpfile.gz bucket-data/changesets/$*.adiff && rm $$tmpfile
+		mv $$tmpfile.gz $(BUCKET_DIR)/$*.adiff && rm $$tmpfile
 	fi
 
-metadatas: $(shell find stage-data/split-adiffs/ -mindepth 1 -type d -printf '%p/metadata.xml\n')
+metadatas: $(shell find $(SPLIT_ADIFFS_DIR)/ -mindepth 1 -type d | sed 's|$$|/metadata.xml|')
 
-stage-data/split-adiffs/%/metadata.xml:
-	curl -sL https://api.openstreetmap.org/api/0.6/changeset/$* -o $@
+$(SPLIT_ADIFFS_DIR)/%/metadata.xml:
+	curl -sL "$(API_URL)/api/0.6/changeset/$*" -o $@
diff --git a/process.sh b/process.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 
-# Usage: process.sh
+# Usage: process.sh [replication_dir] [split_dir] [changeset_dir] [bucket_dir] [api_url] [minutes_filter]
 # 
 # Does the following:
 # - splits all adiffs in stage-data/replication-adiffs/*.adiff into stage-data/split-adiffs/*/
@@ -11,19 +11,47 @@
 # works (the scripts are installed into /mnt/osmcha/bin). In your own deployment you
 # should ensure that the scripts (specifically split_adiff.py, merge_adiff.py, and
 # merge.mk) are in your $PATH, because process.sh assumes they are.
-export PATH=$PATH:/mnt/osmcha/bin
 
-for adiff_file in $(find stage-data/replication-adiffs/ -type f); do
+# export PATH=$PATH:/mnt/osmcha/bin # Is this really necessary? It seems not.
+
+REPLICATION_ADIFFS_DIR=${1:-'stage-data/replication-adiffs'}
+SPLIT_ADIFFS_DIR=${2:-'stage-data/split-adiffs'}
+CHANGESET_DIR=${3:-'stage-data/changesets'}
+BUCKET_DIR=${4:-'bucket-data/replication/minute'}
+API_URL=${5:-'https://api.openstreetmap.org'}
+FILTER_ADIFF_FILES=${6:-''}
+
+
+# Determine which .adiff files to process
+if [ -n "$FILTER_ADIFF_FILES" ]; then
+  echo "Filtering .adiff files modified in the last $FILTER_ADIFF_FILES minutes..."
+  adiff_files=$(find "$REPLICATION_ADIFFS_DIR" -type f -mmin -"$FILTER_ADIFF_FILES")
+else
+  echo "Processing all .adiff files in $REPLICATION_ADIFFS_DIR..."
+  adiff_files=$(find "$REPLICATION_ADIFFS_DIR" -type f)
+fi
+
+for adiff_file in $adiff_files; do
+
   seqno=$(basename -s .adiff $adiff_file)
   tmpdir=$(mktemp -d)
 
   # split the adiff file
-  split_adiff.py $adiff_file $tmpdir
+  python split_adiff.py $adiff_file $tmpdir
+
+  # Check whether any .adiff files were generated
+  if [ -z "$(ls -A "$tmpdir"/*.adiff 2>/dev/null)" ]; then
+    echo "No .adiff files generated from $adiff_file — skipping"
+    rm -rf "$tmpdir"
+    continue
+  fi
 
   for file in $tmpdir/*.adiff; do
     changeset=$(basename -s .adiff $file)
-    mkdir -p stage-data/split-adiffs/$changeset/
-    mv $file stage-data/split-adiffs/$changeset/$seqno.adiff
+    echo "Changeset $changeset: adding adiff from sequence $seqno"
+    # move the adiff file into place, named after its replication sequence number
+    mkdir -p "$SPLIT_ADIFFS_DIR/$changeset/"
+    mv "$file" "$SPLIT_ADIFFS_DIR/$changeset/$seqno.adiff"
   done
 
   rm -rf $tmpdir
@@ -39,10 +67,16 @@ for adiff_file in $(find stage-data/replication-adiffs/ -type f); do
   rm $adiff_file
 done
 
-# merge all our split files, potentially updating existing changesets.
-# this is done using a makefile script in order to avoid needlessly reprocessing
-# changesets whose set of input (split-adiffs/) files haven't changed.
-merge.mk
+# merge all our split files, potentially updating existing changesets.
+# this is done using a makefile script in order to avoid needlessly reprocessing
+# changesets whose set of input (split-adiffs/) files haven't changed.
+# merge.mk is run through make so the directories and API URL can be overridden.
+make -f merge.mk \
+  REPLICATION_ADIFFS_DIR="${REPLICATION_ADIFFS_DIR}" \
+  SPLIT_ADIFFS_DIR="${SPLIT_ADIFFS_DIR}" \
+  CHANGESET_DIR="${CHANGESET_DIR}" \
+  BUCKET_DIR="${BUCKET_DIR}" \
+  API_URL="${API_URL}"
 
-# clean up old stage-data that we don't need anymore
-gc.sh
+# clean up old stage-data that we don't need anymore
+./gc.sh "${SPLIT_ADIFFS_DIR}" "${CHANGESET_DIR}"
diff --git a/update.sh b/update.sh
@@ -11,21 +11,32 @@
 # should run this script in a cron job once per minute. It is good practice
 # to use flock to ensure that only one instance of the job is running at a time.
 
-set -ex
-export PATH=$PATH:/home/osmx/osmx-adiff-builder
+set -e
+export PATH="$PATH:$PWD"
+
+# Args: $1 osmx database path; $2 directory the generated adiffs are moved into; $3 optional initial sequence number (only needed right after the osmx database is created)
+OSMX_DB_PATH=$1
+REPLICATION_ADIFFS_DIR=$2
+INITIAL_SEQNUM=$3
 
 eval "$(mise activate bash --shims)"
 
-osm replication minute --seqno $(osmx query $1 seqnum) \
+if [ -n "$INITIAL_SEQNUM" ]; then  # an explicit start overrides the seqnum stored in the database
+  seqno_start="$INITIAL_SEQNUM"
+else
+  seqno_start="$(osmx query "$OSMX_DB_PATH" seqnum)"
+fi
+
+osm replication minute --seqno "$seqno_start" \
   | while read seqno timestamp url; do
   test -z "$seqno" && continue # skip blank lines or empty output
 
   curl -sL $url | gzip -d > $seqno.osc
   tmpfile=$(mktemp)
 
-  augmented_diff.py $1 $seqno.osc | xmlstarlet format > $tmpfile
-  mv $tmpfile $2/$seqno.adiff
+  augmented_diff.py "$OSMX_DB_PATH" "$seqno.osc" | xmlstarlet format > "$tmpfile"
+  mv "$tmpfile" "$REPLICATION_ADIFFS_DIR/$seqno.adiff"
 
-  osmx update $1 $seqno.osc $seqno $timestamp --commit
+  osmx update "$OSMX_DB_PATH" "$seqno.osc" "$seqno" "$timestamp" --commit
   rm $seqno.osc
 done