-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_stats.py
More file actions
executable file
·111 lines (86 loc) · 4.38 KB
/
generate_stats.py
File metadata and controls
executable file
·111 lines (86 loc) · 4.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Generate publication statistics from figshare articles CSV.
Outputs a markdown table showing publications per author per year.
"""
import pandas as pd
import sys
import argparse
from pathlib import Path
def _format_year(year):
    """Return the table label for a year value.

    pandas reads ``online_year`` as float64 as soon as one row has no value,
    which would otherwise yield headers such as ``2023.0``. Integer-valued
    floats are therefore rendered without the fractional part; any other
    value is passed through ``str`` unchanged.
    """
    if isinstance(year, float) and year.is_integer():
        return str(int(year))
    return str(year)


def _escape_cell(value):
    """Escape pipe characters so a cell value cannot split a markdown table row."""
    return str(value).replace('|', '\\|')


def generate_statistics(all_csv='figshare_articles_all.csv', dedup_csv='figshare_articles.csv'):
    """
    Read the figshare articles CSVs and build a markdown statistics report.

    The report has one row per author with the publication count per year
    (most recent year first, ``-`` for zero), ordered by each author's total,
    followed by a "Total (unique)" row computed from the deduplicated CSV.

    Args:
        all_csv: CSV file with all publications (includes duplicates for
            multi-author papers). Must contain ``author`` and ``online_year``.
        dedup_csv: CSV file with deduplicated publications (for calculating
            true totals). Must contain ``online_year``.

    Returns:
        A markdown string with the statistics table. Problems are not raised:
        a missing file, a missing column or any other failure is reported as
        a string starting with ``"Error"``, and an all-publications CSV with
        no rows yields ``"No publication data available."``.
    """
    try:
        # Per-author file (includes duplicates for multi-author papers)
        df_all = pd.read_csv(all_csv)
        # Deduplicated file (for accurate totals)
        df_dedup = pd.read_csv(dedup_csv)

        if df_all.empty:
            return "No publication data available."

        # Ensure we have the required columns
        if 'author' not in df_all.columns or 'online_year' not in df_all.columns:
            return "Error: Required columns (author, online_year) not found in all articles CSV."
        if 'online_year' not in df_dedup.columns:
            return "Error: Required column (online_year) not found in deduplicated CSV."

        # Count publications per (author, year); rows without a year are
        # dropped by groupby and therefore do not appear in the table.
        stats = df_all.groupby(['author', 'online_year']).size().reset_index(name='count')

        # Pivot to get years as columns
        pivot = stats.pivot(index='author', columns='online_year', values='count').fillna(0).astype(int)

        # Most recent year first; keep the list so the 'Total' column added
        # below never has to be filtered back out.
        year_columns = sorted(pivot.columns, reverse=True)
        pivot = pivot[year_columns]

        # Total per author (from their individual publications), largest first
        pivot['Total'] = pivot.sum(axis=1)
        pivot = pivot.sort_values('Total', ascending=False)

        # Actual yearly totals come from the deduplicated data
        dedup_by_year = df_dedup.groupby('online_year').size()
        unique_total = len(df_dedup)

        # Generate markdown table
        md_lines = ["# Publication Statistics by Author and Year", ""]
        md_lines.append(f"**Total Authors:** {len(pivot)}\n")
        md_lines.append(f"**Total Publications (deduplicated):** {unique_total}\n")
        md_lines.append("")

        # Table header and separator
        headers = ['**Author**', '**Total**'] + [_format_year(year) for year in year_columns]
        md_lines.append('| ' + ' | '.join(headers) + ' |')
        md_lines.append('| ' + ' | '.join(['---' for _ in headers]) + ' |')

        # One row per author; zero counts are shown as '-'
        for author, row in pivot.iterrows():
            counts = [str(int(row[year])) if row[year] > 0 else '-' for year in year_columns]
            values = [f"**{_escape_cell(author)}**", f"**{int(row['Total'])}**"] + counts
            md_lines.append('| ' + ' | '.join(values) + ' |')

        # Yearly totals row using deduplicated data
        year_totals = ['**Total (unique)**', f"**{unique_total}**"] + [
            str(int(dedup_by_year.get(year, 0))) for year in year_columns
        ]
        md_lines.append('| ' + ' | '.join(year_totals) + ' |')

        return '\n'.join(md_lines)
    except FileNotFoundError as e:
        return f"Error: File not found - {e.filename}"
    except Exception as e:
        # Boundary handler: callers expect a printable string, never an exception.
        return f"Error generating statistics: {str(e)}"
def main(argv=None):
    """Run the command-line interface and return the process exit status.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` after printing the statistics to stdout, or ``1`` when
        ``generate_statistics`` reported a failure (its message is then
        written to stderr so it cannot be mistaken for a valid report).
    """
    parser = argparse.ArgumentParser(
        description="Generate publication statistics from FigShare articles CSV files.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '--all-csv',
        type=str,
        default='figshare_articles_all.csv',
        help='Path to CSV file with all publications (includes duplicates for multi-author papers)'
    )
    parser.add_argument(
        '--dedup-csv',
        type=str,
        default='figshare_articles.csv',
        help='Path to CSV file with deduplicated publications (for accurate total counts)'
    )
    args = parser.parse_args(argv)

    # Generate and print statistics
    report = generate_statistics(args.all_csv, args.dedup_csv)

    # generate_statistics signals failure with a message starting with "Error"
    # instead of raising; surface that through stderr and the exit status.
    if report.startswith("Error"):
        print(report, file=sys.stderr)
        return 1

    print(report)
    return 0


if __name__ == "__main__":
    sys.exit(main())