Skip to content

Commit da6ee35

Browse files
authored
Additional Mailing List stats in release report (#1712)
1 parent ca56fc8 commit da6ee35

17 files changed

+545
-33
lines changed

docs/commands.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,3 +320,19 @@ For this to work `SLACK_BOT_API` must be set in the `.env` file.
320320
| Options | Format | Description |
321321
|----------------------|--------|--------------------------------------------------------------|
322322
| `--user_id` | int | If passed, the user with this ID will receive email notifications when this task is started and finished, or if the task raises an exception. |
323+
324+
325+
## `import_ml_counts`
326+
327+
**Purpose**: Import mailing list counts from the mailman archives.
328+
329+
```bash
330+
./manage.py import_ml_counts
331+
```
332+
333+
**Options**
334+
335+
| Options | Format | Description |
336+
|----------------|--------|----------------------------------------------------------------------------------------------------------------------|
337+
| `--start_date` | date | If passed, retrieves data from the start date supplied, d-m-y, default 20-11-1998 (the start of the data in mailman) |
338+
| `--end_date` | date | If passed, retrieves data until the end date supplied, d-m-y, default today |

docs/development_setup_notes.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -313,9 +313,9 @@ https://docs.allauth.org/en/latest/socialaccount/providers/google.html
313313
1. `TF_VAR_google_cloud_email` (the email address of your Google Cloud account)
314314
2. `TF_VAR_google_organization_domain` (usually the domain of your Google Cloud account, e.g. "boost.org" if you will be using an @boost.org email address)
315315
3. `TF_VAR_google_cloud_project_name` (optional, default: localboostdev) - needs to change if destroyed and a setup is needed within 30 days
316-
2. Run `make development-tofu-init` to initialize tofu.
317-
3. Run `make development-tofu-plan` to confirm the planned changes.
318-
4. Run `make development-tofu-apply` to apply the changes.
316+
2. Run `just development-tofu-init` to initialize tofu.
317+
3. Run `just development-tofu-plan` to confirm the planned changes.
318+
4. Run `just development-tofu-apply` to apply the changes.
319319
5. Go to https://console.developers.google.com/
320320
1. Search for the newly created project, named "Boost Development" (ID: localboostdev by default).
321321
2. Type "credentials" in the search input at the top of the page.
@@ -352,6 +352,7 @@ In your env:
352352

353353
#### Set Up Pycharm
354354
You can set up your IDE with a new "Python Debug Server" configuration as:
355+
355356
<img src="images/pycharm_debugger_settings.png" alt="PyCharm Debugger Settings" width="400">
356357

357358
#### Common Usage

docs/first_time_data_import.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ The `boost_setup` command will run all of the processes listed here:
3838

3939
# Get the most recent beta release, and delete old beta releases
4040
./manage.py import_beta_release --delete-versions
41+
./manage.py import_ml_counts
4142
```
4243

4344
Read more about these [management commands](./commands.md).

docs/release_reports.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,17 @@
11
# Release Reports
22

3+
## Prerequisites
4+
5+
1. You should upload updated subscriber data.
6+
1. Ask Sam for a copy of the "subscribe" data.
7+
2. In the Django admin interface go to "Subscription datas" under "MAILING_LIST".
8+
3. At the top of the page click on the "IMPORT 'SUBSCRIBE' DATA" button.
9+
2. Update the mailing list counts, if you haven't already done so by clicking the "DO IT ALL" button:
10+
1. Go to "Versions" under "VERSIONS" in the admin interface
11+
2. At the top of the page click on the "DO IT ALL" button.
12+
13+
## Report Creation
14+
315
1. Go to /admin
416
2. Go to the "Libraries" section
517
3. In the top menu click on "GET RELEASE REPORT".

libraries/forms.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,11 @@
1010
from django.forms import Form, ModelChoiceField, ModelForm, BooleanField
1111

1212
from core.models import RenderedContent
13-
from reports.generation import generate_wordcloud
13+
from reports.generation import (
14+
generate_wordcloud,
15+
get_mailing_list_post_stats,
16+
get_new_subscribers_stats,
17+
)
1418
from slack.models import Channel, SlackActivityBucket, SlackUser
1519
from versions.models import Version
1620
from .models import (
@@ -772,6 +776,12 @@ def get_stats(self):
772776
Channel.objects.filter(name__istartswith="boost").order_by("name"), 10
773777
)
774778
committee_members = version.financial_committee_members.all()
779+
mailinglist_post_stats = get_mailing_list_post_stats(
780+
prior_version.release_date, version.release_date
781+
)
782+
new_subscribers_stats = get_new_subscribers_stats(
783+
prior_version.release_date, version.release_date
784+
)
775785
library_index_library_data = []
776786
for library in self._get_libraries_by_quality():
777787
library_index_library_data.append(
@@ -804,6 +814,8 @@ def get_stats(self):
804814
"mailinglist_total": total_mailinglist_count or 0,
805815
"mailinglist_contributor_release_count": mailinglist_contributor_release_count, # noqa: E501
806816
"mailinglist_contributor_new_count": mailinglist_contributor_new_count,
817+
"mailinglist_post_stats": mailinglist_post_stats,
818+
"mailinglist_new_subscribers_stats": new_subscribers_stats,
807819
"commit_contributors_release_count": commit_contributors_release_count,
808820
"commit_contributors_new_count": commit_contributors_new_count,
809821
"global_contributors_new_count": len(

libraries/management/commands/release_tasks.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ def __init__(self, should_generate_report: bool = False):
8282
ReleaseTask("Updating github issues", ["update_issues"]),
8383
ReleaseTask("Updating slack activity buckets", ["fetch_slack_activity"]),
8484
ReleaseTask("Updating website statistics", self.update_website_statistics),
85+
ReleaseTask("Importing mailing list counts", ["import_ml_counts"]),
8586
ReleaseTask("Generating report", self.generate_report),
8687
]
8788

mailing_list/admin.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,21 @@
1+
import csv
2+
import logging
3+
import re
4+
from datetime import datetime
5+
from io import TextIOWrapper
6+
7+
from django import forms
8+
from django.shortcuts import redirect, render
19
from django.urls import path
210
from django.http import HttpResponseRedirect
311
from django.contrib import admin, messages
412
from django.conf import settings
513

6-
from mailing_list.models import EmailData
14+
from mailing_list.models import EmailData, SubscriptionData
715
from mailing_list.tasks import sync_mailinglist_stats
816

17+
logger = logging.getLogger(__name__)
18+
919

1020
@admin.register(EmailData)
1121
class EmailDataAdmin(admin.ModelAdmin):
@@ -43,3 +53,62 @@ def sync_mailinglist_stats(self, request):
4353

4454
def has_add_permission(self, request):
4555
return False
56+
57+
58+
class SubscribesCSVForm(forms.Form):
    """Upload form with a single file field for the "subscribe" CSV import."""

    csv_file = forms.FileField()
60+
61+
62+
@admin.register(SubscriptionData)
class SubscriptionDataAdmin(admin.ModelAdmin):
    """Admin for SubscriptionData, plus a custom view that imports the
    space-delimited "subscribe" log uploaded through SubscribesCSVForm.
    """

    list_display = ["subscription_dt", "email"]
    search_fields = ["email"]
    change_list_template = "admin/mailinglist_change_list.html"

    # Raw string: "\." inside a plain literal is an invalid escape sequence.
    email_regex = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

    def get_urls(self):
        """Prepend the import view to the default admin URLs.

        The view is wrapped in ``admin_site.admin_view`` so that it gets the
        same staff/permission checks as every built-in admin view; an
        unwrapped view is reachable without logging in to the admin.
        """
        return [
            path(
                "import-csv",
                self.admin_site.admin_view(self.import_csv),
                name="import_csv",
            )
        ] + super().get_urls()

    def parse_rows(self, reader):
        """Yield an unsaved SubscriptionData for every "new" row in ``reader``.

        ``reader`` yields lists of space-separated fields. Fields 0-3 are
        joined and parsed as the timestamp, field 5 is the list name, field 6
        is the entry type and the email address is searched for in field 6
        onwards. Rows that are too short, are not "new" entries, or contain
        no email address are skipped. A row whose timestamp cannot be parsed
        is still yielded, with ``subscription_dt=None``.
        """
        for row in reader:
            if len(row) < 7:
                # Blank or truncated lines have no entry type at index 6.
                if row:
                    logger.warning(f"Skipping malformed row {row=}")
                continue

            entry_type = row[6]
            # only save confirmed subscriber entries, it's all we need for now
            if entry_type != "new":
                continue

            # re-merge, the email address isn't always in a consistent position
            email_matches = self.email_regex.search(" ".join(row[6:]))
            if not email_matches:
                logger.error(
                    f"Invalid email {row=} {email_matches=} {' '.join(row[6:])=}"
                )
                continue

            # Parse the date only for rows we keep, so discarded rows do not
            # produce spurious date errors in the log.
            date_str = " ".join(row[0:4])
            try:
                dt = datetime.strptime(date_str, "%b %d %H:%M:%S %Y")
            except ValueError:
                logger.error(f"Error parsing date {date_str} from {row=}")
                dt = None

            yield SubscriptionData(
                email=email_matches.group(0),
                entry_type=entry_type,
                # NOTE(review): rstrip() strips any trailing run of the
                # characters ":", "-" and "1", not the literal suffix ":-1";
                # a list name ending in "1" or "-" would be shortened too.
                # Confirm this is the intended behaviour.
                list=row[5].rstrip(":-1"),
                subscription_dt=dt,
            )

    def import_csv(self, request):
        """Show the upload form on GET; import the uploaded file on POST.

        An invalid POST (for example, no file attached) re-renders the form
        with its validation errors instead of raising on the missing upload.
        """
        if request.method == "POST":
            form = SubscribesCSVForm(request.POST, request.FILES)
            if form.is_valid():
                csv_file = form.cleaned_data["csv_file"]
                rows = TextIOWrapper(csv_file, encoding="ISO-8859-1", newline="")
                reader = csv.reader(rows, delimiter=" ")
                SubscriptionData.objects.bulk_create(
                    self.parse_rows(reader), batch_size=500, ignore_conflicts=True
                )
                self.message_user(request, "Subscribe CSV file imported.")
                return redirect("..")
        else:
            form = SubscribesCSVForm()

        payload = {"form": form}
        return render(request, "admin/mailinglist_subscribe_csv_form.html", payload)

mailing_list/constants.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# we only want boost devel for now, leaving the others in case that changes.
2+
ML_STATS_URLS = [
3+
"https://lists.boost.org/Archives/boost/{:04}/{:02}/author.php",
4+
# "https://lists.boost.org/boost-users/{:04}/{:02}/author.php",
5+
# "https://lists.boost.org/boost-announce/{:04}/{:02}/author.php",
6+
]
7+
ARG_DATE_REGEX = r"^([0-9]+)(?:$|(?:-|/)([0-9]+)(?:$|(?:-|/)([0-9]+)$))"
8+
AUTHOR_PATTERN_REGEX = r"<li><strong>(.*)</strong>"
9+
DATE_PATTERN_REGEX = r".*<em>\((\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\)</em>"
10+
11+
# maps the Unicode code points of Windows-1252 characters back to their single-byte values (128-159) for the mailing list
12+
# page html parser
13+
LATIN_1_EQUIVS = {
14+
8364: 128,
15+
8218: 130,
16+
402: 131,
17+
8222: 132,
18+
8230: 133,
19+
8224: 134,
20+
8225: 135,
21+
710: 136,
22+
8240: 137,
23+
352: 138,
24+
8249: 139,
25+
338: 140,
26+
381: 142,
27+
8216: 145,
28+
8217: 146,
29+
8220: 147,
30+
8221: 148,
31+
8226: 149,
32+
8211: 150,
33+
8212: 151,
34+
732: 152,
35+
8482: 153,
36+
353: 154,
37+
8250: 155,
38+
339: 156,
39+
382: 158,
40+
376: 159,
41+
}
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# Copyright 2024 Dave O'Connor
2+
# Derived from code by Joaquin M Lopez Munoz.
3+
# Distributed under the Boost Software License, Version 1.0.
4+
# (See accompanying file LICENSE_1_0.txt or copy at
5+
# http://www.boost.org/LICENSE_1_0.txt)
6+
import djclick as click
7+
import logging
8+
import re
9+
import warnings
10+
from datetime import timedelta, datetime
11+
import html
12+
13+
from dateutil.relativedelta import relativedelta
14+
from unidecode import unidecode
15+
16+
import requests
17+
18+
from mailing_list.constants import (
19+
ML_STATS_URLS,
20+
LATIN_1_EQUIVS,
21+
ARG_DATE_REGEX,
22+
AUTHOR_PATTERN_REGEX,
23+
DATE_PATTERN_REGEX,
24+
)
25+
from mailing_list.models import PostingData
26+
27+
logger = logging.getLogger(__name__)
28+
29+
arg_date_pattern = re.compile(ARG_DATE_REGEX)
30+
author_pattern = re.compile(AUTHOR_PATTERN_REGEX)
31+
date_pattern = re.compile(DATE_PATTERN_REGEX)
32+
33+
34+
def decode_broken_html(str):
    """Return an ASCII transliteration of mis-encoded text from an archive page.

    HTML entities are unescaped, every character is turned back into a byte
    (code points listed in LATIN_1_EQUIVS are folded to their mapped byte
    value), the bytes are decoded as UTF-8 with invalid sequences dropped,
    and the result is passed through ``unidecode``.

    The parameter keeps its original name ``str`` (which shadows the builtin
    inside this function) so that keyword callers continue to work.
    """
    raw = bytearray()
    for char in html.unescape(str):
        code = ord(char)
        code = LATIN_1_EQUIVS.get(code, code)
        if code < 256:
            raw.append(code)
        else:
            # Not representable as one byte (e.g. produced by a numeric HTML
            # entity). bytearray() would raise ValueError here, so keep the
            # character by emitting its UTF-8 encoding instead.
            raw.extend(char.encode("utf-8", "ignore"))

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return unidecode(raw.decode("utf-8", "ignore"))
44+
45+
46+
def parse_start_datetime(date_str):
    """Parse a ``--start_date`` argument into the first instant it covers.

    Accepted forms (separator ``-`` or ``/``), matching parse_end_datetime:

    * ``D-M-Y`` -> that day at 00:00:00
    * ``Y-M``   -> the first day of that month at 00:00:00
    * ``Y``     -> 1 January of that year at 00:00:00

    Raises:
        ValueError: if ``date_str`` does not match the expected pattern or
            the numbers do not form a valid date.
    """
    m = arg_date_pattern.match(date_str)
    if not m:
        raise ValueError("wrong date format")
    logger.info(f"{m=} {m.group(1)=} {m.group(2)=} {m.group(3)=}")

    first, second, third = m.groups()
    if third:
        # Full date, given as day-month-year.
        return datetime(int(third), int(second), int(first), 0, 0, 0)
    if second:
        # Partial date: year-month, as in parse_end_datetime.
        return datetime(int(first), int(second), 1, 0, 0, 0)
    # Partial date: year only, as in parse_end_datetime.
    return datetime(int(first), 1, 1, 0, 0, 0)
59+
60+
61+
def parse_end_datetime(date_str):
    """Parse an ``--end_date`` argument into the last instant it covers.

    Accepted forms (separator ``-`` or ``/``):

    * ``D-M-Y`` -> that day at 23:59:59
    * ``Y-M``   -> the last day of that month at 23:59:59
    * ``Y``     -> 31 December of that year at 23:59:59

    Raises:
        ValueError: if ``date_str`` does not match the expected pattern or
            the numbers do not form a valid date.
    """
    m = arg_date_pattern.match(date_str)
    if not m:
        raise ValueError("wrong date format")
    logger.info(f"{m=} {m.group(1)=} {m.group(2)=} {m.group(3)=}")

    first, second, third = m.groups()
    if second and third:
        return datetime(int(third), int(second), int(first), 23, 59, 59)
    if second:
        # Day 1 plus 31 days always lands in the following month; snapping
        # back to day 1 and subtracting one day gives the last day of the
        # requested month, with the 23:59:59 time preserved.
        month_start = datetime(int(first), int(second), 1, 23, 59, 59)
        next_month_start = (month_start + timedelta(days=31)).replace(day=1)
        return next_month_start - timedelta(days=1)
    return datetime(int(first), 12, 31, 23, 59, 59)
79+
80+
81+
def retrieve_authors_from_ml(url, start_date, end_date):
    """Fetch one archive author-index page and return its posts.

    The page is scanned line by line: a line matching ``author_pattern``
    sets the current author, and each following line matching
    ``date_pattern`` is one post by that author.

    Args:
        url: address of the author index page to download.
        start_date: earliest post time to keep (inclusive).
        end_date: latest post time to keep (inclusive).

    Returns:
        A list of unsaved PostingData objects; empty when the page does not
        exist (HTTP 404).

    Raises:
        requests.RequestException: on timeout, connection failure, or any
            HTTP error status other than 404.
    """
    posts = []
    logger.info(f"Retrieving data from {url=}.")
    # Without a timeout a stalled connection would block the import forever.
    r = requests.get(url, timeout=30)
    if r.status_code == 404:
        return posts
    # Any other error page must not be parsed as if it were an archive page.
    r.raise_for_status()

    author = None
    for line in r.text.splitlines():
        author_match = author_pattern.match(line)
        if author_match:
            # needs multiple passes to work
            author = decode_broken_html(author_match.group(1))
            continue

        date_pattern_match = date_pattern.match(line)
        if author and date_pattern_match:
            post_date = datetime.strptime(
                date_pattern_match.group(1), "%Y-%m-%d %H:%M:%S"
            )
            if start_date <= post_date <= end_date:
                posts.append(PostingData(name=author, post_time=post_date))
    return posts
103+
104+
105+
def retrieve_authors(start_date, end_date):
    """Download posting data month by month and replace the stored rows.

    Every month from the one containing ``start_date`` to the one containing
    ``end_date`` is fetched for each URL template in ML_STATS_URLS. Existing
    PostingData rows whose ``post_time`` lies in the range are then deleted
    and the freshly fetched ones are bulk-created.
    """
    logger.info(f"retrieve_authors from {start_date=} to {end_date=}")
    month_cursor = datetime(start_date.year, start_date.month, 1)
    final_month = datetime(end_date.year, end_date.month, 1)
    one_month = relativedelta(months=+1)

    collected = []
    while not month_cursor > final_month:
        for url_template in ML_STATS_URLS:
            page_url = url_template.format(month_cursor.year, month_cursor.month)
            collected.extend(retrieve_authors_from_ml(page_url, start_date, end_date))
        month_cursor = month_cursor + one_month

    stale_rows = PostingData.objects.filter(
        post_time__gte=start_date, post_time__lte=end_date
    )
    stale_rows.delete()
    PostingData.objects.bulk_create(collected)
120+
121+
122+
# Entry point for `./manage.py import_ml_counts`.
# Deliberately no docstring: click would surface it as the command's help text.
@click.command()
@click.option("--start_date", is_flag=False, help="Start Date", default=None)
@click.option("--end_date", is_flag=False, help="End Date", default=None)
def command(start_date, end_date):
    logger.info(f"Starting import_ml_counts {start_date=} {end_date=}")

    if start_date:
        start_date = parse_start_datetime(start_date)
    else:
        # NOTE(review): docs/commands.md gives the default as 20-11-1998,
        # while this is 11 Nov 1998 - confirm which one is intended.
        start_date = datetime(1998, 11, 11)
    logger.info(f"{start_date=}")

    if end_date:
        end_date = parse_end_datetime(end_date)
    else:
        end_date = datetime.now()
    logger.info(f"{end_date=}")

    retrieve_authors(start_date, end_date)

0 commit comments

Comments
 (0)