boostorg
diff --git a/‎docs/commands.md‎
Lines changed: 16 additions & 0 deletions b/‎docs/commands.md‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎docs/development_setup_notes.md‎
Lines changed: 4 additions & 3 deletions b/‎docs/development_setup_notes.md‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎docs/first_time_data_import.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/first_time_data_import.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/release_reports.md‎
Lines changed: 12 additions & 0 deletions b/‎docs/release_reports.md‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎libraries/forms.py‎
Lines changed: 13 additions & 1 deletion b/‎libraries/forms.py‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎libraries/management/commands/release_tasks.py‎
Lines changed: 1 addition & 0 deletions b/‎libraries/management/commands/release_tasks.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎mailing_list/admin.py‎
Lines changed: 70 additions & 1 deletion b/‎mailing_list/admin.py‎
Lines changed: 70 additions & 1 deletion
diff --git a/‎mailing_list/constants.py‎
Lines changed: 41 additions & 0 deletions b/‎mailing_list/constants.py‎
Lines changed: 41 additions & 0 deletions
diff --git a/‎mailing_list/management/commands/import_ml_counts.py‎
Lines changed: 133 additions & 0 deletions b/‎mailing_list/management/commands/import_ml_counts.py‎
Lines changed: 133 additions & 0 deletions
@@ -320,3 +320,19 @@ For this to work `SLACK_BOT_API` must be set in the `.env` file.
 | Options              | Format | Description                                                  |
 |----------------------|--------|--------------------------------------------------------------|
 | `--user_id`  | int  | If passed, the user with this ID will receive email notifications when this task is started and finished, or if the task raises an exception. |
+
+
+## `import_ml_counts`
+
+**Purpose**: Import mailing list counts from the mailman archives.
+
+```bash
+./manage.py import_ml_counts
+```
+
+**Options**
+
+| Options        | Format | Description                                                                                                          |
+|----------------|--------|----------------------------------------------------------------------------------------------------------------------|
+| `--start_date` | date   | If passed, retrieves data from the start date supplied, d-m-y, default 20-11-1998 (the start of the data in mailman) |
+| `--end_date`   | date   | If passed, retrieves data until the end date supplied, d-m-y, default today                                          |
@@ -313,9 +313,9 @@ https://docs.allauth.org/en/latest/socialaccount/providers/google.html
    1. `TF_VAR_google_cloud_email` (the email address of your Google Cloud account)
    2. `TF_VAR_google_organization_domain` (usually the domain of your Google Cloud account, e.g. "boost.org" if you will be using an @boost.org email address)
    3. `TF_VAR_google_cloud_project_name` (optional, default: localboostdev) - needs to change if destroyed and a setup is needed within 30 days
-2. Run `make development-tofu-init` to initialize tofu.
-3. Run `make development-tofu-plan` to confirm the planned changes.
-4. Run `make development-tofu-apply` to apply the changes.
+2. Run `just development-tofu-init` to initialize tofu.
+3. Run `just development-tofu-plan` to confirm the planned changes.
+4. Run `just development-tofu-apply` to apply the changes.
 5. Go to https://console.developers.google.com/
    1. Search for the newly created project, named "Boost Development" (ID: localboostdev by default).
    2. Type "credentials" in the search input at the top of the page.
@@ -352,6 +352,7 @@ In your env:
 
 #### Set Up Pycharm
 You can set up your IDE with a new "Python Debug Server" configuration as:
+
 <img src="images/pycharm_debugger_settings.png" alt="PyCharm Debugger Settings" width="400">
 
 #### Common Usage
 
@@ -38,6 +38,7 @@ The `boost_setup` command will run all of the processes listed here:
 
 # Get the most recent beta release, and delete old beta releases
 ./manage.py import_beta_release --delete-versions
+./manage.py import_ml_counts
 ```
 
 Read more about these [management commands](./commands.md).
 
@@ -1,5 +1,17 @@
 # Release Reports
 
+## Prerequisites
+
+1. You should upload updated subscriber data.
+   1. Ask Sam for a copy of the "subscribe" data.
+   2. In the Django admin interface go to "Subscription datas" under "MAILING_LIST".
+   3. At the top of the page click on the "IMPORT 'SUBSCRIBE' DATA" button.
+2. To update the mailing list counts, if you haven't already clicked the "DO IT ALL" button:
+   1. Go to "Versions" under "VERSIONS" in the admin interface
+   2. At the top of the page click on the "DO IT ALL" button.
+
+## Report Creation
+
 1. Go to /admin
 2. Go to the "Libraries" section
 3. In the top menu click on "GET RELEASE REPORT".
 
@@ -10,7 +10,11 @@
 from django.forms import Form, ModelChoiceField, ModelForm, BooleanField
 
 from core.models import RenderedContent
-from reports.generation import generate_wordcloud
+from reports.generation import (
+    generate_wordcloud,
+    get_mailing_list_post_stats,
+    get_new_subscribers_stats,
+)
 from slack.models import Channel, SlackActivityBucket, SlackUser
 from versions.models import Version
 from .models import (
@@ -772,6 +776,12 @@ def get_stats(self):
             Channel.objects.filter(name__istartswith="boost").order_by("name"), 10
         )
         committee_members = version.financial_committee_members.all()
+        mailinglist_post_stats = get_mailing_list_post_stats(
+            prior_version.release_date, version.release_date
+        )
+        new_subscribers_stats = get_new_subscribers_stats(
+            prior_version.release_date, version.release_date
+        )
         library_index_library_data = []
         for library in self._get_libraries_by_quality():
             library_index_library_data.append(
@@ -804,6 +814,8 @@ def get_stats(self):
             "mailinglist_total": total_mailinglist_count or 0,
             "mailinglist_contributor_release_count": mailinglist_contributor_release_count,  # noqa: E501
             "mailinglist_contributor_new_count": mailinglist_contributor_new_count,
+            "mailinglist_post_stats": mailinglist_post_stats,
+            "mailinglist_new_subscribers_stats": new_subscribers_stats,
             "commit_contributors_release_count": commit_contributors_release_count,
             "commit_contributors_new_count": commit_contributors_new_count,
             "global_contributors_new_count": len(
 
@@ -82,6 +82,7 @@ def __init__(self, should_generate_report: bool = False):
             ReleaseTask("Updating github issues", ["update_issues"]),
             ReleaseTask("Updating slack activity buckets", ["fetch_slack_activity"]),
             ReleaseTask("Updating website statistics", self.update_website_statistics),
+            ReleaseTask("Importing mailing list counts", ["import_ml_counts"]),
             ReleaseTask("Generating report", self.generate_report),
         ]
 
 
@@ -1,11 +1,21 @@
+import csv
+import logging
+import re
+from datetime import datetime
+from io import TextIOWrapper
+
+from django import forms
+from django.shortcuts import redirect, render
 from django.urls import path
 from django.http import HttpResponseRedirect
 from django.contrib import admin, messages
 from django.conf import settings
 
-from mailing_list.models import EmailData
+from mailing_list.models import EmailData, SubscriptionData
 from mailing_list.tasks import sync_mailinglist_stats
 
+logger = logging.getLogger(__name__)
+
 
 @admin.register(EmailData)
 class EmailDataAdmin(admin.ModelAdmin):
@@ -43,3 +53,62 @@ def sync_mailinglist_stats(self, request):
 
     def has_add_permission(self, request):
         return False
+
+
+class SubscribesCSVForm(forms.Form):
+    """Single-field upload form for the mailing list "subscribe" log file."""
+
+    # The uploaded file; SubscriptionDataAdmin.import_csv reads it from
+    # request.FILES under this field name.
+    csv_file = forms.FileField()
+
+
+@admin.register(SubscriptionData)
+class SubscriptionDataAdmin(admin.ModelAdmin):
+    list_display = ["subscription_dt", "email"]
+    search_fields = ["email"]
+    change_list_template = "admin/mailinglist_change_list.html"
+
+    email_regex = re.compile("([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")
+
+    def get_urls(self):
+        return [
+            path("import-csv", self.import_csv, name="import_csv")
+        ] + super().get_urls()
+
+    def parse_rows(self, reader):
+        for row in reader:
+            date_str = " ".join(row[0:4])
+            try:
+                dt = datetime.strptime(date_str, "%b %d %H:%M:%S %Y")
+            except ValueError:
+                logger.error(f"Error parsing date {date_str} from {row=}")
+                dt = None
+            # re-merge, the email address isn't always in a consistent position
+            email_matches = re.search(self.email_regex, " ".join(row[6:]))
+            email = email_matches.group(0) if email_matches else None
+            entry_type = row[6]
+            # only save confirmed subscriber entries, it's all we need for now
+            if entry_type != "new":
+                continue
+            if not email:
+                logger.error(
+                    f"Invalid email {row=} {email_matches=} {' '.join(row[6:])=}"
+                )
+                continue
+            yield SubscriptionData(
+                email=email,
+                entry_type=entry_type,
+                list=row[5].rstrip(":-1"),
+                subscription_dt=dt,
+            )
+
+    def import_csv(self, request):
+        if request.method == "POST":
+            csv_file = request.FILES["csv_file"]
+            rows = TextIOWrapper(csv_file, encoding="ISO-8859-1", newline="")
+            reader = csv.reader(rows, delimiter=" ")
+            SubscriptionData.objects.bulk_create(
+                self.parse_rows(reader), batch_size=500, ignore_conflicts=True
+            )
+            self.message_user(request, "Subscribe CSV file imported.")
+            return redirect("..")
+
+        payload = {"form": SubscribesCSVForm()}
+        return render(request, "admin/mailinglist_subscribe_csv_form.html", payload)
@@ -0,0 +1,41 @@
+# we only want boost devel for now, leaving the others in case that changes.
+# Each URL template takes two positional format arguments: a zero-padded
+# 4-digit year and a zero-padded 2-digit month.
+ML_STATS_URLS = [
+    "https://lists.boost.org/Archives/boost/{:04}/{:02}/author.php",
+    # "https://lists.boost.org/boost-users/{:04}/{:02}/author.php",
+    # "https://lists.boost.org/boost-announce/{:04}/{:02}/author.php",
+]
+# One to three numeric groups separated by "-" or "/", anchored at both ends.
+ARG_DATE_REGEX = r"^([0-9]+)(?:$|(?:-|/)([0-9]+)(?:$|(?:-|/)([0-9]+)$))"
+# Captures the text between "<li><strong>" and "</strong>" (the author name).
+AUTHOR_PATTERN_REGEX = r"<li><strong>(.*)</strong>"
+# Captures a "YYYY-MM-DD HH:MM:SS" timestamp wrapped in "<em>(...)</em>".
+DATE_PATTERN_REGEX = r".*<em>\((\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\)</em>"
+
+# Used by the mailing list page html parser to undo mis-decoded text.
+# Maps the Unicode code points of the characters that Windows-1252 places in
+# the 0x80-0x9F byte range (e.g. 8364, the euro sign) back to their single-byte
+# values (e.g. 128), so the text can be rebuilt as bytes and decoded as utf-8.
+LATIN_1_EQUIVS = {
+    8364: 128,
+    8218: 130,
+    402: 131,
+    8222: 132,
+    8230: 133,
+    8224: 134,
+    8225: 135,
+    710: 136,
+    8240: 137,
+    352: 138,
+    8249: 139,
+    338: 140,
+    381: 142,
+    8216: 145,
+    8217: 146,
+    8220: 147,
+    8221: 148,
+    8226: 149,
+    8211: 150,
+    8212: 151,
+    732: 152,
+    8482: 153,
+    353: 154,
+    8250: 155,
+    339: 156,
+    382: 158,
+    376: 159,
+}
@@ -0,0 +1,133 @@
+# Copyright 2024 Dave O'Connor
+# Derived from code by Joaquin M Lopez Munoz.
+# Distributed under the Boost Software License, Version 1.0.
+# (See accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt)
+import djclick as click
+import logging
+import re
+import warnings
+from datetime import timedelta, datetime
+import html
+
+from dateutil.relativedelta import relativedelta
+from unidecode import unidecode
+
+import requests
+
+from mailing_list.constants import (
+    ML_STATS_URLS,
+    LATIN_1_EQUIVS,
+    ARG_DATE_REGEX,
+    AUTHOR_PATTERN_REGEX,
+    DATE_PATTERN_REGEX,
+)
+from mailing_list.models import PostingData
+
+logger = logging.getLogger(__name__)
+
+arg_date_pattern = re.compile(ARG_DATE_REGEX)
+author_pattern = re.compile(AUTHOR_PATTERN_REGEX)
+date_pattern = re.compile(DATE_PATTERN_REGEX)
+
+
+def decode_broken_html(str):
+    def latin_1_ord(char):
+        n = ord(char)
+        return LATIN_1_EQUIVS.get(n, n)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        return unidecode(
+            bytearray(map(latin_1_ord, html.unescape(str))).decode("utf-8", "ignore")
+        )
+
+
+def parse_start_datetime(date_str):
+    m = arg_date_pattern.match(date_str)
+    if not m:
+        raise ValueError("wrong date format")
+    logger.info(f"{m=} {m.group(1)=} {m.group(2)=} {m.group(3)=}")
+    return datetime(
+        int(m.group(3)) if m.group(3) else 1,
+        int(m.group(2)) if m.group(2) else 1,
+        int(m.group(1)),
+        0,
+        0,
+        0,
+    )
+
+
+def parse_end_datetime(date_str):
+    m = arg_date_pattern.match(date_str)
+    if not m:
+        raise ValueError("wrong date format")
+    logger.info(f"{m=} {m.group(1)=} {m.group(2)=} {m.group(3)=}")
+    if m.group(2):
+        if m.group(3):
+            return datetime(
+                int(m.group(3)), int(m.group(2)), int(m.group(1)), 23, 59, 59
+            )
+        else:
+            return (
+                datetime(int(m.group(1)), int(m.group(2)), 1) + timedelta(days=31),
+                23,
+                59,
+                59,
+            ).replace(day=1) - timedelta(days=1)
+    return datetime(int(m.group(1)), 12, 31, 23, 59, 59)
+
+
+def retrieve_authors_from_ml(url, start_date, end_date):
+    posts = []
+    logger.info(f"Retrieving data from {url=}.")
+    r = requests.get(url)
+    if r.status_code == 404:
+        return posts
+
+    author = None
+    for line in r.text.splitlines():
+        author_match = author_pattern.match(line)
+        if author_match:
+            # needs multiple passes to work
+            author = decode_broken_html(author_match.group(1))
+        else:
+            date_pattern_match = date_pattern.match(line)
+            if author and date_pattern_match:
+                post_date = datetime.strptime(
+                    date_pattern_match.group(1), "%Y-%m-%d %H:%M:%S"
+                )
+                if start_date <= post_date and post_date <= end_date:
+                    posts.append(PostingData(name=author, post_time=post_date))
+    return posts
+
+
+def retrieve_authors(start_date, end_date):
+    logger.info(f"retrieve_authors from {start_date=} to {end_date=}")
+    start_month = datetime(start_date.year, start_date.month, 1)
+    end_month = datetime(end_date.year, end_date.month, 1)
+    authors = []
+    while start_month <= end_month:
+        for ml in ML_STATS_URLS:
+            authors += retrieve_authors_from_ml(
+                ml.format(start_month.year, start_month.month), start_date, end_date
+            )
+        start_month = start_month + relativedelta(months=+1)
+    PostingData.objects.filter(
+        post_time__gte=start_date, post_time__lte=end_date
+    ).delete()
+    PostingData.objects.bulk_create(authors)
+
+
+@click.command()
+@click.option("--start_date", is_flag=False, help="Start Date", default=None)
+@click.option("--end_date", is_flag=False, help="End Date", default=None)
+def command(start_date, end_date):
+    logger.info(f"Starting import_ml_counts {start_date=} {end_date=}")
+    start_date = (
+        parse_start_datetime(start_date) if start_date else datetime(1998, 11, 11)
+    )
+    logger.info(f"{start_date=}")
+    end_date = parse_end_datetime(end_date) if end_date else datetime.now()
+    logger.info(f"{end_date=}")
+    retrieve_authors(start_date, end_date)
Original file line number	Diff line number	Diff line change
`@@ -82,6 +82,7 @@ def __init__(self, should_generate_report: bool = False):`
`82`	`82`	`ReleaseTask("Updating github issues", ["update_issues"]),`
`83`	`83`	`ReleaseTask("Updating slack activity buckets", ["fetch_slack_activity"]),`
`84`	`84`	`ReleaseTask("Updating website statistics", self.update_website_statistics),`
	`85`	`+ ReleaseTask("Importing mailing list counts", ["import_ml_counts"]),`
`85`	`86`	`ReleaseTask("Generating report", self.generate_report),`
`86`	`87`	`]`
`87`	`88`