Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,21 @@

namespace PKP\migration\upgrade\v3_4_0;

use Illuminate\Bus\Batch;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Bus;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Schema;
use PKP\core\Core;
use PKP\install\DowngradeNotSupportedException;
use PKP\migration\Migration;
use PKP\migration\upgrade\v3_4_0\jobs\FixRegionCodes;
use PKP\migration\upgrade\v3_4_0\jobs\CleanTmpChangesForRegionCodesFixes;
use PKP\migration\upgrade\v3_4_0\jobs\FixRegionCodesDaily;
use PKP\migration\upgrade\v3_4_0\jobs\FixRegionCodesMonthly;
use PKP\migration\upgrade\v3_4_0\jobs\PreFixRegionCodesDaily;
use PKP\migration\upgrade\v3_4_0\jobs\PreFixRegionCodesMonthly;
use PKP\migration\upgrade\v3_4_0\jobs\RegionMappingTmpClear;
use PKP\migration\upgrade\v3_4_0\jobs\RegionMappingTmpInsert;
use Throwable;

class I8866_DispatchRegionCodesFixingJobs extends Migration
{
Expand All @@ -31,8 +37,9 @@ class I8866_DispatchRegionCodesFixingJobs extends Migration
*/
public function up(): void
{
if (DB::table('metrics_submission_geo_monthly')->whereNotNull('region')->exists() ||
DB::table('metrics_submission_geo_daily')->whereNotNull('region')->exists()) {
if (DB::table('metrics_submission_geo_monthly')->where('region', '<>', '')->exists() ||
DB::table('metrics_submission_geo_daily')->where('region', '<>', '')->exists()) {

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GitHub doesn't allow me to comment on the lines below, but adding an index to the temporary table's columns might be helpful to performance.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think an index would be needed even if the table only contains a few entries, let's say max 100?

// create a temporary table for the FIPS-ISO mapping
if (!Schema::hasTable('region_mapping_tmp')) {
Schema::create('region_mapping_tmp', function (Blueprint $table) {
Expand All @@ -42,6 +49,14 @@ public function up(): void
});
}

// temporarily change the length of the region columns, because we will add the prefix 'pkp-'
Schema::table('metrics_submission_geo_daily', function (Blueprint $table) {
$table->string('region', 7)->change();
});
Schema::table('metrics_submission_geo_monthly', function (Blueprint $table) {
$table->string('region', 7)->change();
});
Comment on lines +53 to +58
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sounds risky, if the job never gets completed, the database will be left with an unexpected structure.

The same for the indexes below.


// temporary create index on the column country and region, in order to be able to update the region codes in a reasonable time
Schema::table('metrics_submission_geo_daily', function (Blueprint $table) {
$sm = Schema::getConnection()->getDoctrineSchemaManager();
Expand All @@ -58,35 +73,45 @@ public function up(): void
}
});

$geoDailyIdMax = DB::table('metrics_submission_geo_daily')
->max('metrics_submission_geo_daily_id');
$geoMonthlyIdMax = DB::table('metrics_submission_geo_monthly')
->max('metrics_submission_geo_monthly_id');

$chunkSize = 100000;
$geoDailyChunksNo = ceil($geoDailyIdMax / $chunkSize);
$geoMonthlyChunksNo = ceil($geoMonthlyIdMax / $chunkSize);

// read the FIPS to ISO mappings and dispatch a job per country
$mappings = include Core::getBaseDir() . '/' . PKP_LIB_PATH . '/lib/regionMapping.php';
$jobs = [];
foreach (array_keys($mappings) as $country) {
$jobs[] = new FixRegionCodes($country);
$jobs[] = new RegionMappingTmpInsert($country);
for ($i = 0; $i < $geoDailyChunksNo; $i++) {
$startId = ($i * $chunkSize) + 1;
$endId = min(($i + 1) * $chunkSize, $geoDailyIdMax);
$jobs[] = new PreFixRegionCodesDaily($startId, $endId);
}
for ($i = 0; $i < $geoMonthlyChunksNo; $i++) {
$startId = ($i * $chunkSize) + 1;
$endId = min(($i + 1) * $chunkSize, $geoMonthlyIdMax);
$jobs[] = new PreFixRegionCodesMonthly($startId, $endId);
}
for ($i = 0; $i < $geoDailyChunksNo; $i++) {
$startId = ($i * $chunkSize) + 1;
$endId = min(($i + 1) * $chunkSize, $geoDailyIdMax);
$jobs[] = new FixRegionCodesDaily($startId, $endId);
}
for ($i = 0; $i < $geoMonthlyChunksNo; $i++) {
$startId = ($i * $chunkSize) + 1;
$endId = min(($i + 1) * $chunkSize, $geoMonthlyIdMax);
$jobs[] = new FixRegionCodesMonthly($startId, $endId);
}
$jobs[] = new RegionMappingTmpClear();
}

Bus::batch($jobs)
->then(function (Batch $batch) {
// drop the temporary index
Schema::table('metrics_submission_geo_daily', function (Blueprint $table) {
$sm = Schema::getConnection()->getDoctrineSchemaManager();
$indexesFound = $sm->listTableIndexes('metrics_submission_geo_daily');
if (array_key_exists('metrics_submission_geo_daily_tmp_index', $indexesFound)) {
$table->dropIndex(['tmp']);
}
});
Schema::table('metrics_submission_geo_monthly', function (Blueprint $table) {
$sm = Schema::getConnection()->getDoctrineSchemaManager();
$indexesFound = $sm->listTableIndexes('metrics_submission_geo_monthly');
if (array_key_exists('metrics_submission_geo_monthly_tmp_index', $indexesFound)) {
$table->dropIndex(['tmp']);
}
});

// drop the temporary table
if (Schema::hasTable('region_mapping_tmp')) {
Schema::drop('region_mapping_tmp');
}
$jobs[] = new CleanTmpChangesForRegionCodesFixes();
Bus::chain($jobs)
->catch(function (Throwable $e) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be some error logging here, the Throwable covers even syntax errors at included files.

})
->dispatch();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
<?php

/**
* @file classes/migration/upgrade/v3_4_0/jobs/CleanTmpChangesForRegionCodesFixes.php
*
 * Copyright (c) 2025 Simon Fraser University
 * Copyright (c) 2025 John Willinsky
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
*
* @class CleanTmpChangesForRegionCodesFixes
*
* @ingroup jobs
*
* @brief Drop temporary indexes on metrics Geo tables, drop the table region_mapping_tmp, and return the region column size
*/

namespace PKP\migration\upgrade\v3_4_0\jobs;

use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;
use PKP\jobs\BaseJob;

class CleanTmpChangesForRegionCodesFixes extends BaseJob
{
    /**
     * Execute the job.
     *
     * Reverts the temporary schema changes made for the region code fixes:
     * removes the temporary indexes from the metrics Geo tables, restores the
     * original region column length, and drops the FIPS-ISO mapping table.
     */
    public function handle(): void
    {
        // drop the temporary index on each metrics Geo table
        $this->dropTemporaryIndex('metrics_submission_geo_daily');
        $this->dropTemporaryIndex('metrics_submission_geo_monthly');

        // return the length of the region columns to varchar(3); the wider
        // column was only needed while the temporary 'pkp-' prefix was in use
        foreach (['metrics_submission_geo_daily', 'metrics_submission_geo_monthly'] as $tableName) {
            Schema::table($tableName, function (Blueprint $table) {
                $table->string('region', 3)->change();
            });
        }

        // drop the temporary FIPS-ISO mapping table, if it still exists
        if (Schema::hasTable('region_mapping_tmp')) {
            Schema::drop('region_mapping_tmp');
        }
    }

    /**
     * Drop the temporary index from the given table, if it exists.
     *
     * The index was created on the column set ['tmp'] naming convention, so
     * Laravel resolves dropIndex(['tmp']) to the name "{$tableName}_tmp_index".
     */
    private function dropTemporaryIndex(string $tableName): void
    {
        Schema::table($tableName, function (Blueprint $table) use ($tableName) {
            $schemaManager = Schema::getConnection()->getDoctrineSchemaManager();
            $existingIndexes = $schemaManager->listTableIndexes($tableName);
            if (array_key_exists("{$tableName}_tmp_index", $existingIndexes)) {
                $table->dropIndex(['tmp']);
            }
        });
    }
}
93 changes: 0 additions & 93 deletions classes/migration/upgrade/v3_4_0/jobs/FixRegionCodes.php

This file was deleted.

67 changes: 67 additions & 0 deletions classes/migration/upgrade/v3_4_0/jobs/FixRegionCodesDaily.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
<?php

/**
* @file classes/migration/upgrade/v3_4_0/jobs/FixRegionCodesDaily.php
*
 * Copyright (c) 2025 Simon Fraser University
 * Copyright (c) 2025 John Willinsky
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
*
* @class FixRegionCodesDaily
*
* @ingroup jobs
*
* @brief Fixes the wrong region codes, using the temporary table of FIPS-ISO mapping.
*/

namespace PKP\migration\upgrade\v3_4_0\jobs;

use Illuminate\Support\Facades\DB;
use PKP\config\Config;
use PKP\jobs\BaseJob;

class FixRegionCodesDaily extends BaseJob
{
    /** The range of metrics_submission_geo_daily_ids to consider for this update */
    protected int $startId;
    protected int $endId;

    /**
     * Create a new job instance.
     *
     * @param int $startId First metrics_submission_geo_daily_id (inclusive) to process
     * @param int $endId Last metrics_submission_geo_daily_id (inclusive) to process
     */
    public function __construct(int $startId, int $endId)
    {
        parent::__construct();
        $this->startId = $startId;
        $this->endId = $endId;
    }

    /**
     * Execute the job.
     *
     * Rewrites the wrong ('pkp-'-prefixed FIPS) region codes to their ISO
     * equivalents, using the temporary table region_mapping_tmp, for the rows
     * whose id falls within [startId, endId].
     */
    public function handle(): void
    {
        // update region code from FIPS to ISO, according to the entries in the table region_mapping_tmp
        // Laravel join+update does not work well with PostgreSQL, so use the direct SQLs
        if (str_starts_with(Config::getVar('database', 'driver'), 'postgres')) {
            // FIX: the two id range conditions were previously missing the
            // joining AND, which made this statement a SQL syntax error
            DB::statement(
                "
                UPDATE metrics_submission_geo_daily AS gd
                SET region = rm.iso
                FROM region_mapping_tmp AS rm
                WHERE gd.country = rm.country AND
                    gd.region = 'pkp-' || rm.fips AND
                    gd.metrics_submission_geo_daily_id >= ? AND
                    gd.metrics_submission_geo_daily_id <= ?
                ",
                [$this->startId, $this->endId]
            );
        } else {
            DB::statement(
                "
                UPDATE metrics_submission_geo_daily gd
                INNER JOIN region_mapping_tmp rm ON (rm.country = gd.country AND CONCAT('pkp-', rm.fips) = gd.region)
                SET gd.region = rm.iso
                WHERE gd.metrics_submission_geo_daily_id >= ? AND
                    gd.metrics_submission_geo_daily_id <= ?
                ",
                [$this->startId, $this->endId]
            );
        }
    }
}
Loading