diff --git a/definitions/output/reports/reports_dynamic.js b/definitions/output/reports/reports_dynamic.js
index 70c5a7b..a4c8b9d 100644
--- a/definitions/output/reports/reports_dynamic.js
+++ b/definitions/output/reports/reports_dynamic.js
@@ -1,107 +1,215 @@
-const configs = new reports.HTTPArchiveReports()
-const metrics = configs.listMetrics()
+/**
+ * Dynamic Reports Generator
+ *
+ * This file automatically generates Dataform operations for HTTP Archive reports.
+ * It creates operations for each combination of:
+ * - Date range (from startDate to endDate)
+ * - Metrics (defined in includes/reports.js)
+ * - SQL types (histogram, timeseries)
+ * - Lenses (data filters like all, top1k, wordpress, etc.)
+ *
+ * Each operation:
+ * 1. Calculates metrics from crawl data
+ * 2. Stores results in BigQuery tables
+ * 3. Exports data to Cloud Storage as JSON
+ */
 
-const bucket = 'httparchive'
-const storagePath = '/reports/dev/'
+// Initialize configurations
+const httpArchiveReports = new reports.HTTPArchiveReports()
+const availableMetrics = httpArchiveReports.listMetrics()
+const availableLenses = httpArchiveReports.lenses
+
+// Configuration constants
+const EXPORT_CONFIG = {
+  bucket: constants.bucket,
+  storagePath: constants.storagePath,
+  dataset: 'reports',
+  testSuffix: '.json'
+}
+
+// Date range for report generation
+// Adjust these dates to update reports retrospectively
+const DATE_RANGE = {
+  startDate: constants.currentMonth, // '2025-07-01'
+  endDate: constants.currentMonth // '2025-07-01'
+}
+
+/**
+ * Generates the Cloud Storage export path for a report
+ * @param {Object} reportConfig - Report configuration object
+ * @returns {string} - Cloud Storage object path
+ */
+function buildExportPath(reportConfig) {
+  const { sql, date, metric } = reportConfig
+  let objectPath = EXPORT_CONFIG.storagePath
 
-function generateExportQuery (metric, sql, params, ctx) {
-  let query = ''
   if (sql.type === 'histogram') {
-    query = `
-SELECT
-  * EXCEPT(date)
-FROM ${ctx.self()}
-WHERE date = '${params.date}'
-`
+    // Histogram exports are organized by date folders
+    const dateFolder = date.replaceAll('-', '_')
+    objectPath += `${dateFolder}/${metric.id}`
   } else if (sql.type === 'timeseries') {
-    query = `
-SELECT
-  FORMAT_DATE('%Y_%m_%d', date) AS date,
-  * EXCEPT(date)
-FROM ${ctx.self()}
-`
+    // Timeseries exports are organized by metric
+    objectPath += metric.id
   } else {
-    throw new Error('Unknown SQL type')
+    throw new Error(`Unknown SQL type: ${sql.type}`)
   }
 
-  const queryOutput = query.replace(/[\r\n]+/g, ' ')
-  return queryOutput
+  return objectPath + EXPORT_CONFIG.testSuffix
 }
 
-function generateExportPath (metric, sql, params) {
+/**
+ * Generates the BigQuery export query for a report
+ * @param {Object} reportConfig - Report configuration object
+ * @returns {string} - SQL query for exporting data
+ */
+function buildExportQuery(reportConfig) {
+  const { sql, date, metric, lens, tableName } = reportConfig
+
+  let query
   if (sql.type === 'histogram') {
-    return `${storagePath}${params.date.replaceAll('-', '_')}/${metric.id}.json`
+    query = `
+      SELECT
+        * EXCEPT(date, metric, lens)
+      FROM \`${EXPORT_CONFIG.dataset}.${tableName}\`
+      WHERE date = '${date}'
+        AND metric = '${metric.id}'
+        AND lens = '${lens.name}'
+      ORDER BY bin ASC
+    `
   } else if (sql.type === 'timeseries') {
-    return `${storagePath}${metric.id}.json`
+    query = `
+      SELECT
+        FORMAT_DATE('%Y_%m_%d', date) AS date,
+        * EXCEPT(date, metric, lens)
+      FROM \`${EXPORT_CONFIG.dataset}.${tableName}\`
+      WHERE metric = '${metric.id}'
+        AND lens = '${lens.name}'
+      ORDER BY date DESC
+    `
   } else {
-    throw new Error('Unknown SQL type')
+    throw new Error(`Unknown SQL type: ${sql.type}`)
   }
+
+  // Convert to single line for JSON embedding
+  return query.replace(/[\r\n]+/g, ' ').trim()
 }
 
-const iterations = []
-for (
-  let date = constants.currentMonth; date >= constants.currentMonth; date = constants.fnPastMonth(date)) {
-  iterations.push({
+/**
+ * Creates a report configuration object
+ * @param {string} date - Report date (YYYY-MM-DD)
+ * @param {Object} metric - Metric configuration
+ * @param {Object} sql - SQL configuration (type and query)
+ * @param {string} lensName - Lens name
+ * @param {string} lensSQL - Lens SQL filter
+ * @returns {Object} - Complete report configuration
+ */
+function createReportConfig(date, metric, sql, lensName, lensSQL) {
+  return {
     date,
-    devRankFilter: constants.devRankFilter
-  })
+    metric,
+    sql,
+    lens: { name: lensName, sql: lensSQL },
+    devRankFilter: constants.devRankFilter,
+    tableName: `${metric.id}_${sql.type}`
+  }
 }
 
-if (iterations.length === 1) {
-  const params = iterations[0]
-  metrics.forEach(metric => {
-    metric.SQL.forEach(sql => {
-      publish(metric.id + '_' + sql.type, {
-        type: 'incremental',
-        protected: true,
-        bigquery: sql.type === 'histogram' ? { partitionBy: 'date', clusterBy: ['client'] } : {},
-        schema: 'reports'
-        // tags: ['crawl_complete', 'http_reports']
-      }).preOps(ctx => `
---DELETE FROM ${ctx.self()}
---WHERE date = '${params.date}';
-      `).query(
-        ctx => sql.query(ctx, params)
-      ).postOps(ctx => `
-SELECT
-  reports.run_export_job(
-    JSON '''{
-      "destination": "cloud_storage",
-      "config": {
-        "bucket": "${bucket}",
-        "name": "${generateExportPath(metric, sql, params)}"
-      },
-      "query": "${generateExportQuery(metric, sql, params, ctx)}"
-    }'''
-  );
-      `)
-    })
-  })
-} else {
-  iterations.forEach(params => {
-    metrics.forEach(metric => {
+/**
+ * Generates all report configurations for the specified date range
+ * @returns {Array} - Array of report configuration objects
+ */
+function generateReportConfigurations() {
+  const reportConfigs = []
+
+  // Generate configurations for each date in range
+  for (let date = DATE_RANGE.endDate;
+    date >= DATE_RANGE.startDate;
+    date = constants.fnPastMonth(date)) {
+
+    // For each available metric
+    availableMetrics.forEach(metric => {
+      // For each SQL type (histogram, timeseries)
       metric.SQL.forEach(sql => {
-        operate(metric.id + '_' + sql.type + '_' + params.date, {
-          // tags: ['crawl_complete', 'http_reports']
-        }).queries(ctx => `
-DELETE FROM reports.${metric.id}_${sql.type}
-WHERE date = '${params.date}';
-
-INSERT INTO reports.${metric.id}_${sql.type}` + sql.query(ctx, params)
-        ).postOps(ctx => `
-  SELECT
-    reports.run_export_job(
-      JSON '''{
-        "destination": "cloud_storage",
-        "config": {
-          "bucket": "${bucket}",
-          "name": "${generateExportPath(metric, sql, params)}"
-        },
-        "query": "${generateExportQuery(metric, sql, params, ctx)}"
-      }'''
-    );
-    `)
+        // For each available lens (all, top1k, wordpress, etc.)
+        Object.entries(availableLenses).forEach(([lensName, lensSQL]) => {
+          const config = createReportConfig(date, metric, sql, lensName, lensSQL)
+          reportConfigs.push(config)
+        })
       })
     })
-  })
+  }
+
+  return reportConfigs
+}
+
+/**
+ * Creates a Dataform operation name for a report configuration
+ * @param {Object} reportConfig - Report configuration object
+ * @returns {string} - Operation name
+ */
+function createOperationName(reportConfig) {
+  const { tableName, date, lens } = reportConfig
+  return `${tableName}_${date}_${lens.name}`
+}
+
+/**
+ * Generates the SQL for a Dataform operation
+ * @param {Object} ctx - Dataform context
+ * @param {Object} reportConfig - Report configuration object
+ * @returns {string} - Complete SQL for the operation
+ */
+function generateOperationSQL(ctx, reportConfig) {
+  const { date, metric, lens, sql, tableName } = reportConfig
+
+  return `
+DECLARE job_config JSON;
+
+/* First report run - uncomment to create table
+CREATE TABLE IF NOT EXISTS ${EXPORT_CONFIG.dataset}.${tableName}
+PARTITION BY date
+CLUSTER BY metric, lens, client
+AS
+*/
+
+--/* Subsequent report run
+DELETE FROM ${EXPORT_CONFIG.dataset}.${tableName}
+WHERE date = '${date}'
+  AND metric = '${metric.id}'
+  AND lens = '${lens.name}';
+INSERT INTO ${EXPORT_CONFIG.dataset}.${tableName}
+--*/
+
+SELECT
+  '${metric.id}' AS metric,
+  '${lens.name}' AS lens,
+  *
+FROM (
+  ${sql.query(ctx, reportConfig)}
+);
+
+SET job_config = TO_JSON(
+  STRUCT(
+    "cloud_storage" AS destination,
+    STRUCT(
+      "httparchive" AS bucket,
+      "${buildExportPath(reportConfig)}" AS name
+    ) AS config,
+    r"${buildExportQuery(reportConfig)}" AS query
+  )
+);
+
+SELECT reports.run_export_job(job_config);
+`
 }
+
+// Generate all report configurations
+const reportConfigurations = generateReportConfigurations()
+
+// Create Dataform operations for each report configuration
+reportConfigurations.forEach(reportConfig => {
+  const operationName = createOperationName(reportConfig)
+
+  operate(operationName)
+    .tags(['crawl_complete', 'crawl_reports'])
+    .queries(ctx => generateOperationSQL(ctx, reportConfig))
+})
diff --git a/includes/constants.js b/includes/constants.js
index dec69ff..3f10905 100644
--- a/includes/constants.js
+++ b/includes/constants.js
@@ -49,6 +49,7 @@ class DataformTemplateBuilder {
     if (typeof value === 'string') return `'${value}'`
     if (typeof value === 'number') return value.toString()
     if (typeof value === 'boolean') return value.toString()
+    if (typeof value === 'function') return value.toString()
 
     // For objects or arrays, use JSON.stringify
     return JSON.stringify(value)
diff --git a/includes/reports.js b/includes/reports.js
index 6ca82a8..5d01df6 100644
--- a/includes/reports.js
+++ b/includes/reports.js
@@ -7,6 +7,20 @@ const config = {
       {
         type: 'histogram',
         query: DataformTemplateBuilder.create((ctx, params) => `
+WITH pages AS (
+  SELECT
+    date,
+    client,
+    CAST(FLOOR(FLOAT64(summary.bytesTotal) / 1024 / 100) * 100 AS INT64) AS bin
+  FROM ${ctx.ref('crawl', 'pages')}
+  WHERE
+    date = '${params.date}'
+    ${params.devRankFilter}
+    ${params.lens.sql}
+    AND is_root_page
+    AND FLOAT64(summary.bytesTotal) > 0
+)
+
 SELECT
   *,
   SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
@@ -16,48 +30,55 @@ FROM (
     volume / SUM(volume) OVER (PARTITION BY client) AS pdf
   FROM (
     SELECT
-      date,
-      client,
-      CAST(FLOOR(INT64(summary.bytesTotal) / 1024 / 100) * 100 AS INT64) AS bin,
+      *,
       COUNT(0) AS volume
-    FROM ${ctx.ref('crawl', 'pages')}
-    WHERE
-      date = '${params.date}' ${params.devRankFilter}
+    FROM pages
+    WHERE bin IS NOT NULL
     GROUP BY date, client, bin
-    HAVING bin IS NOT NULL
   )
 )
+ORDER BY
+  bin,
+  client
 `)
       },
       {
         type: 'timeseries',
         query: DataformTemplateBuilder.create((ctx, params) => `
+WITH pages AS (
+  SELECT
+    date,
+    client,
+    FLOAT64(summary.bytesTotal) AS bytesTotal
+  FROM ${ctx.ref('crawl', 'pages')}
+  WHERE
+    date = '${params.date}'
+    ${params.devRankFilter}
+    ${params.lens.sql}
+    AND is_root_page
+    AND INT64(summary.bytesTotal) > 0
+)
+
 SELECT
   date,
   client,
-  UNIX_SECONDS(TIMESTAMP(date)) AS timestamp,
+  UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
   ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(101)] / 1024, 2) AS p10,
   ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(251)] / 1024, 2) AS p25,
   ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(501)] / 1024, 2) AS p50,
   ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(751)] / 1024, 2) AS p75,
   ROUND(APPROX_QUANTILES(bytesTotal, 1001)[OFFSET(901)] / 1024, 2) AS p90
-FROM (
-  SELECT
-    date,
-    client,
-    INT64(summary.bytesTotal) AS bytesTotal
-  FROM ${ctx.ref('crawl', 'pages')}
-  WHERE
-    date = '${params.date}' ${params.devRankFilter} AND
-    INT64(summary.bytesTotal) > 0
-)
+FROM pages
 GROUP BY
   date,
   client,
   timestamp
+ORDER BY
+  date,
+  client
 `)
       }
     ]
@@ -65,12 +86,24 @@ GROUP BY
   }
 }
 
+const lenses = {
+  all: '',
+  top1k: 'AND rank <= 1000',
+  top10k: 'AND rank <= 10000',
+  top100k: 'AND rank <= 100000',
+  top1m: 'AND rank <= 1000000',
+  drupal: 'AND \'Drupal\' IN UNNEST(technologies.technology)',
+  magento: 'AND \'Magento\' IN UNNEST(technologies.technology)',
+  wordpress: 'AND \'WordPress\' IN UNNEST(technologies.technology)'
+}
+
 class HTTPArchiveReports {
-  constructor () {
+  constructor() {
     this.config = config
+    this.lenses = lenses
   }
 
-  listReports () {
+  listReports() {
     const reportIds = this.config._reports
 
     const reports = reportIds.map(reportId => {
@@ -81,7 +114,7 @@ class HTTPArchiveReports {
     return reports
   }
 
-  getReport (reportId) {
+  getReport(reportId) {
     const report = this.config[reportId]
     return {
       id: reportId,
@@ -89,7 +122,7 @@ class HTTPArchiveReports {
     }
   }
 
-  listMetrics (reportId) {
+  listMetrics(reportId) {
     if (reportId === undefined) {
       const metrics = Object.keys(this.config._metrics).map(metricId => {
         const metric = this.getMetric(metricId)
@@ -110,7 +143,7 @@ class HTTPArchiveReports {
     }
   }
 
-  getMetric (metricId) {
+  getMetric(metricId) {
     const metric = this.config._metrics[metricId]
 
     return {
diff --git a/infra/tf/dataform.tf b/infra/tf/dataform.tf
index 60d7c08..b0d3c80 100644
--- a/infra/tf/dataform.tf
+++ b/infra/tf/dataform.tf
@@ -7,10 +7,8 @@ locals {
     "sample_data",
     "wappalyzer",
 
-    // Blink features
-    "blink_features",
-
     // Reports
+    "blink_features",
     "core_web_vitals", // TODO: Remove after tech report migration
     "reports",
 
diff --git a/reports.md b/reports.md
new file mode 100644
index 0000000..8b4ddb4
--- /dev/null
+++ b/reports.md
@@ -0,0 +1,335 @@
+# HTTP Archive Reports
+
+This document describes the HTTP Archive reports system, which automatically generates standardized reports from HTTP Archive crawl data.
+
+## Overview
+
+The reports system generates Dataform operations that:
+
+1. Calculate metrics from HTTP Archive crawl data
+2. Store results in BigQuery tables partitioned by date and clustered by metric/lens/client
+3. Export data to Cloud Storage as JSON files for consumption by external systems
+
+## Architecture
+
+### Core Components
+
+- **`includes/reports.js`** - Defines metrics and lenses
+- **`definitions/output/reports/reports_dynamic.js`** - Generates Dataform operations
+- **`includes/constants.js`** - Provides shared constants and the `DataformTemplateBuilder`
+
+## Supported Features
+
+### SQL Types
+
+The system supports two types of SQL queries:
+
+#### 1. Histogram
+
+- **Purpose**: Distribution analysis with binned data
+- **Output**: Contains `bin`, `volume`, `pdf`, `cdf` columns (see the worked example below)
+- **Use case**: Page weight distributions, performance metric distributions
+- **Export path**: `reports/{date_folder}/{metric_id}.json`
+
+#### 2. Timeseries
+
+- **Purpose**: Trend analysis over time
+- **Output**: Contains percentile data (p10, p25, p50, p75, p90) with timestamps
+- **Use case**: Performance trends, adoption over time
+- **Export path**: `reports/{metric_id}.json`
+
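+To make the histogram columns concrete, here is a minimal sketch with made-up numbers showing how `volume`, `pdf`, and `cdf` relate (the real values come from the SQL window functions in `includes/reports.js`):
+
+```javascript
+// Hypothetical rows for one client in a page-weight histogram (100 KB bins)
+const rows = [
+  { bin: 0, volume: 10, pdf: 0.1, cdf: 0.1 },
+  { bin: 100, volume: 50, pdf: 0.5, cdf: 0.6 },
+  { bin: 200, volume: 40, pdf: 0.4, cdf: 1.0 }
+]
+
+// Invariants enforced by the SQL: pdf = volume / total, cdf = running sum of pdf
+const total = rows.reduce((sum, r) => sum + r.volume, 0)
+rows.forEach(r => console.assert(Math.abs(r.pdf - r.volume / total) < 1e-9))
+```
+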
+### Lenses (Data Filters)
+
+Lenses allow filtering data by different criteria (see the composition sketch after this list):
+
+- **`all`** - No filter, all pages
+- **`top1k`** - Top 1,000 ranked sites
+- **`top10k`** - Top 10,000 ranked sites
+- **`top100k`** - Top 100,000 ranked sites
+- **`top1m`** - Top 1,000,000 ranked sites
+- **`drupal`** - Sites using Drupal
+- **`magento`** - Sites using Magento
+- **`wordpress`** - Sites using WordPress
+
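+Each lens is spliced into the metric queries through the `${params.lens.sql}` template parameter. A minimal sketch of the composition (illustrative values, not the actual template):
+
+```javascript
+const lens = {
+  name: 'wordpress',
+  sql: "AND 'WordPress' IN UNNEST(technologies.technology)"
+}
+
+// The metric template drops the lens condition straight into its WHERE clause:
+const where = `
+  WHERE date = '2025-07-01'
+    ${lens.sql}
+    AND is_root_page`
+```
+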
+### Date Range Processing
+
+- Configurable start and end dates
+- Processes data month by month using `constants.fnPastMonth()` (see the loop excerpt below)
+- Supports retrospective report generation
+
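+The iteration lives in `generateReportConfigurations()` in `reports_dynamic.js`: it walks backwards from `endDate` to `startDate`, one month at a time. Because the dates are `YYYY-MM-DD` strings, the `>=` comparison works lexicographically:
+
+```javascript
+for (let date = DATE_RANGE.endDate;
+  date >= DATE_RANGE.startDate;
+  date = constants.fnPastMonth(date)) {
+  // ...one batch of report configurations is generated per month
+}
+```
+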
+## How to Add a New Report
+
+### Step 1: Define Your Metric
+
+Add your metric to the `_metrics` object in `includes/reports.js`:
+
+```javascript
+const config = {
+  _metrics: {
+    // Existing metrics...
+
+    myNewMetric: {
+      SQL: [
+        {
+          type: 'histogram', // or 'timeseries'
+          query: DataformTemplateBuilder.create((ctx, params) => `
+            WITH pages AS (
+              SELECT
+                date,
+                client,
+                -- Your binning logic for histogram
+                CAST(FLOOR(your_metric_value / bin_size) * bin_size AS INT64) AS bin
+              FROM ${ctx.ref('crawl', 'pages')}
+              WHERE
+                date = '${params.date}'
+                ${params.devRankFilter}
+                ${params.lens.sql}
+                AND is_root_page
+                AND your_metric_value > 0
+            )
+
+            -- Your aggregation logic here
+            SELECT
+              *,
+              SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
+            FROM (
+              -- Calculate probability density function
+              SELECT
+                *,
+                volume / SUM(volume) OVER (PARTITION BY client) AS pdf
+              FROM (
+                SELECT
+                  *,
+                  COUNT(0) AS volume
+                FROM pages
+                WHERE bin IS NOT NULL
+                GROUP BY date, client, bin
+              )
+            )
+            ORDER BY bin, client
+          `)
+        }
+      ]
+    }
+  }
+}
+```
+
+### Step 2: Test Your Metric
+
+The metric will be automatically included in the next run of `reports_dynamic.js`. The system will generate operations for all combinations of:
+
+- Your new metric
+- All available lenses
+- All SQL types you defined
+- The configured date range
+
+### Step 3: Verify Output
+
+Check that the generated operations:
+
+1. Create the expected BigQuery tables
+2. Populate data correctly
+3. Export to Cloud Storage in the expected format
+
+## Metric SQL Requirements
+
+### Template Parameters
+
+Your SQL template receives these parameters:
+
+```javascript
+{
+  date: '2025-07-01', // Current processing date
+  devRankFilter: 'AND rank <= 10000', // Development filter
+  lens: {
+    name: 'top1k', // Lens name
+    sql: 'AND rank <= 1000' // Lens SQL filter
+  },
+  metric: { id: 'myMetric', ... }, // Metric configuration
+  sql: { type: 'histogram', ... } // SQL type configuration
+}
+```
+
+### Required Columns
+
+#### For Histogram Type
+
+- `date` - Processing date
+- `client` - 'desktop' or 'mobile'
+- `bin` - Numeric bin value
+- `volume` - Count of pages in this bin
+- `pdf` - Probability density function value
+- `cdf` - Cumulative distribution function value
+
+#### For Timeseries Type
+
+- `date` - Processing date
+- `client` - 'desktop' or 'mobile'
+- `timestamp` - Unix timestamp in milliseconds
+- `p10`, `p25`, `p50`, `p75`, `p90` - Percentile values
+
+### Best Practices
+
+1. **Filter root pages**: Always include `AND is_root_page` unless you specifically need all pages
+2. **Handle null values**: Use appropriate null checks and filtering
+3. **Use consistent binning**: For histograms, use logical bin sizes (e.g., 100 KB increments for page weight)
+4. **Optimize performance**: Use appropriate WHERE clauses and avoid expensive operations
+5. **Test with dev filters**: Your queries should work with the development rank filter
+
+## Lenses
+
+Each lens's SQL is a valid BigQuery WHERE clause condition that is appended to the main query.
+
+## Processing Details
+
+### Operation Generation
+
+For each combination of date, metric, SQL type, and lens, the system:
+
+1. **Creates a unique operation name**: `{metricId}_{sqlType}_{date}_{lensName}` (see the example below)
+2. **Generates BigQuery SQL** that:
+   - Deletes existing data for the date/metric/lens combination
+   - Inserts new calculated data
+   - Exports results to Cloud Storage
+3. **Tags operations** with `crawl_complete` and `crawl_reports` so they are triggered when a crawl completes.
+
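+For example, `createOperationName()` turns the `bytesTotal` histogram for the July 2025 crawl under the `wordpress` lens into:
+
+```javascript
+createOperationName({
+  tableName: 'bytesTotal_histogram',
+  date: '2025-07-01',
+  lens: { name: 'wordpress' }
+})
+// => 'bytesTotal_histogram_2025-07-01_wordpress'
+```
+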
+### Table Structure
+
+Reports are stored in BigQuery tables with this structure:
+
+- **Partitioned by**: `date`
+- **Clustered by**: `metric`, `lens`, `client`
+- **Dataset**: `reports`
+- **Naming**: `{metricId}_{sqlType}` (e.g., `bytesTotal_histogram`)
+
+### Export Process
+
+1. Data is calculated and stored in BigQuery
+2. A `run_export_job` function exports filtered data to Cloud Storage (see the payload sketch below)
+3. Export paths follow the pattern:
+   - Histogram: `reports/[{lens}/]{date_underscore}/{metric_id}.json`
+   - Timeseries: `reports/[{lens}/]{metric_id}.json`
+
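+Conceptually, `generateOperationSQL()` assembles a payload like the following for `reports.run_export_job` (values here are illustrative; the real payload is built in SQL via `TO_JSON(STRUCT(...))`):
+
+```javascript
+const jobConfig = {
+  destination: 'cloud_storage',
+  config: {
+    bucket: 'httparchive',
+    name: 'reports/2025_07_01/bytesTotal.json'
+  },
+  // Single-line SQL string produced by buildExportQuery()
+  query: 'SELECT * EXCEPT(date, metric, lens) FROM `reports.bytesTotal_histogram` WHERE ...'
+}
+```
+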
+### Development vs Production
+
+- **Development**: Uses `TABLESAMPLE` and rank filters for faster processing
+- **Production**: Processes full datasets
+- **Environment detection**: Automatic, based on `dataform.projectConfig.vars.environment` (see the sketch below)
+
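+A minimal sketch of what such a switch can look like (the actual filter lives in `includes/constants.js`; the names below are assumptions):
+
+```javascript
+// Hypothetical sketch: derive a dev-only rank filter from the project vars
+const isDev = dataform.projectConfig.vars.environment !== 'production'
+const devRankFilter = isDev ? 'AND rank <= 10000' : ''
+```
+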
+## Configuration
+
+### Date Range
+
+Modify the `DATE_RANGE` object in `reports_dynamic.js`:
+
+```javascript
+const DATE_RANGE = {
+  startDate: '2025-01-01', // Start processing from this date
+  endDate: '2025-07-01' // Process up to this date
+}
+```
+
+### Export Configuration
+
+Modify the `EXPORT_CONFIG` object:
+
+```javascript
+const EXPORT_CONFIG = {
+  bucket: 'your-storage-bucket',
+  storagePath: 'reports/',
+  dataset: 'reports',
+  testSuffix: '.json'
+}
+```
+
+## Troubleshooting
+
+### Debugging
+
+1. **Check operation logs** in Dataform for SQL errors
+2. **Verify table creation** in the BigQuery console
+3. **Check export logs** in Cloud Run for export errors
+4. **Verify Cloud Storage paths** for exported files
+5. **Test SQL templates** individually before adding them
+6. **Use development environment** with smaller datasets for testing
+
+## Examples
+
+### Adding a JavaScript Bundle Size Metric
+
+```javascript
+jsBytes: {
+  SQL: [
+    {
+      type: 'histogram',
+      query: DataformTemplateBuilder.create((ctx, params) => `
+        WITH pages AS (
+          SELECT
+            date,
+            client,
+            CAST(FLOOR(FLOAT64(summary.bytesJS) / 1024 / 50) * 50 AS INT64) AS bin
+          FROM ${ctx.ref('crawl', 'pages')}
+          WHERE
+            date = '${params.date}'
+            ${params.devRankFilter}
+            ${params.lens.sql}
+            AND is_root_page
+            AND INT64(summary.bytesJS) > 0
+        )
+
+        SELECT
+          *,
+          SUM(pdf) OVER (PARTITION BY client ORDER BY bin) AS cdf
+        FROM (
+          SELECT
+            *,
+            volume / SUM(volume) OVER (PARTITION BY client) AS pdf
+          FROM (
+            SELECT
+              *,
+              COUNT(0) AS volume
+            FROM pages
+            WHERE bin IS NOT NULL
+            GROUP BY date, client, bin
+          )
+        )
+        ORDER BY bin, client
+      `)
+    },
+    {
+      type: 'timeseries',
+      query: DataformTemplateBuilder.create((ctx, params) => `
+        WITH pages AS (
+          SELECT
+            date,
+            client,
+            FLOAT64(summary.bytesJS) AS bytesJS
+          FROM ${ctx.ref('crawl', 'pages')}
+          WHERE
+            date = '${params.date}'
+            ${params.devRankFilter}
+            ${params.lens.sql}
+            AND is_root_page
+            AND INT64(summary.bytesJS) > 0
+        )
+
+        SELECT
+          date,
+          client,
+          UNIX_DATE(date) * 1000 * 60 * 60 * 24 AS timestamp,
+          ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(101)] / 1024, 2) AS p10,
+          ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(251)] / 1024, 2) AS p25,
+          ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(501)] / 1024, 2) AS p50,
+          ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(751)] / 1024, 2) AS p75,
+          ROUND(APPROX_QUANTILES(bytesJS, 1001)[OFFSET(901)] / 1024, 2) AS p90
+        FROM pages
+        GROUP BY date, client, timestamp
+        ORDER BY date, client
+      `)
+    }
+  ]
+}
+```
+
+This would automatically generate reports for JavaScript bundle sizes across all lenses and the configured date range.