-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcleanup_dashes.py
More file actions
158 lines (125 loc) · 5.28 KB
/
cleanup_dashes.py
File metadata and controls
158 lines (125 loc) · 5.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/env python3
"""
Cleanup script: remove em/en dashes and AI patterns from all draft articles.
Handles paired dashes (parenthetical), definition dashes, and single dashes.
Preserves code blocks and table formatting.
"""
import re, os, glob
# Directory scanned for drafts: "content/drafts" next to this script file.
DRAFTS_DIR = os.path.join(os.path.dirname(__file__), "content", "drafts")

# Table of phrases treated as AI-sounding, with their preferred replacements.
# Keys are raw strings (some carry \b word-boundary anchors). A value is either
# a dict mapping an exact phrase to its replacement, or a plain replacement
# string, where "" means the phrase is simply deleted.
# NOTE(review): no function visible in this file reads AI_PATTERNS;
# fix_ai_patterns() uses its own hard-coded replacement list. Confirm whether
# this table is still needed or should become the single source of truth.
AI_PATTERNS = {
    r"\bfundamentally\b": {
        "fundamentally different approach": "structurally different approach",
        "fundamentally different": "structurally different",
        "fundamentally": "structurally",
        "changes fundamentally": "changes completely",
        "is fundamentally different": "differs structurally",
    },
    r"\bessentially\b": {
        "is essentially": "is effectively",
        "essentially a": "effectively a",
        "essentially your": "effectively your",
    },
    r"\blandscape\b": {
        "regulatory landscape": "regulatory environment",
        "The L2 Landscape": "The L2 Ecosystem",
        "Landscape": "Ecosystem",
        "landscape": "environment",
    },
    r"This means that ": "",
    r"This ensures that ": "",
}
def fix_dashes_in_line(line):
    """Replace em/en dashes in a single line with plain punctuation.

    Rules, applied in this order:

    * A line without any em/en dash is returned unchanged.
    * A line starting with a code fence is returned unchanged.
    * An unspaced en dash between two digits ("10–20", "2020–2023") is a
      numeric range and becomes an ASCII hyphen.
    * In a table row (line starting with "|") every remaining dash becomes
      " / ".
    * In prose, paired dashes "X — Y — Z" become "X (Y) Z", a bold term
      followed by a dash ("**Term** — text") becomes "**Term**: text", and
      every remaining dash becomes ", ".

    Args:
        line: One line of text, without its trailing newline.

    Returns:
        The rewritten line.
    """
    # Lines with no dash are left alone so that the comma clean-up at the end
    # cannot alter text this function has no business touching (e.g. "a,,b").
    if "—" not in line and "–" not in line:
        return line

    stripped = line.strip()
    if stripped.startswith("```"):
        return line

    # A range such as "10–20%" would be corrupted by turning the dash into a
    # comma or slash, so it is converted to a hyphen before any other rule.
    line = re.sub(r'(?<=\d)–(?=\d)', '-', line)

    if stripped.startswith("|"):
        # Spaced forms are replaced first so they do not end up with doubled
        # spaces around the slash.
        for dash in (" — ", "—", " – ", "–"):
            line = line.replace(dash, " / ")
        return line

    # Paired dashes: "X — Y — Z" → "X (Y) Z"
    line = re.sub(r' — ([^—\n]+?) — ', r' (\1) ', line)
    # Definition style: "**Term** — definition" → "**Term**: definition"
    line = re.sub(r'(\*\*[^*]+\*\*) — ', r'\1: ', line)

    # Every remaining dash, spaced or not, becomes a comma followed by a space.
    for dash in (" — ", "—", " – ", "–"):
        line = line.replace(dash, ", ")

    # A dash that followed an existing comma leaves ",," or ", ,"; collapse
    # any such run into a single comma.
    line = re.sub(r',(?:\s*,)+', ',', line)
    return line
def fix_ai_patterns(content):
    """Replace AI-sounding phrases with plainer alternatives.

    The lead-ins "This means that " and "This ensures that " are deleted and
    the first letter of the word that follows is upper-cased, because that
    word becomes the new start of the sentence. After that a fixed list of
    phrase substitutions is applied. Matching is case-sensitive.

    Args:
        content: Text to clean; may be a single line or a whole document.

    Returns:
        The text with the lead-ins removed and the phrases substituted.
    """
    def _promote_next_letter(match):
        # group(1) is the character after the lead-in, or "" when the
        # lead-in is followed by a non-word character or the end of the text.
        return match.group(1).upper()

    for lead_in in ("This means that ", "This ensures that "):
        content = re.sub(re.escape(lead_in) + r'(\w?)', _promote_next_letter, content)

    # Order matters: a specific phrase must come before the generic word it
    # contains, otherwise the generic rule would consume it first.
    replacements = (
        ("changes fundamentally", "changes completely"),
        ("fundamentally", "structurally"),
        ("essentially", "effectively"),
        ("The L2 Landscape", "The L2 Ecosystem"),
        ("L2 landscape", "L2 ecosystem"),
        ("DeFi landscape", "DeFi ecosystem"),
        ("landscape", "environment"),
    )
    for old, new in replacements:
        content = content.replace(old, new)
    return content
def process_file(filepath):
    """Clean dashes and AI-sounding phrases in one file, in place.

    The file is read and written as UTF-8 so the non-ASCII dash characters
    are handled the same way on every platform. Fence lines (starting with
    three backticks) and every line between an opening and closing fence are
    copied through untouched; all other lines go through
    fix_dashes_in_line() and then fix_ai_patterns(). The file is only
    rewritten when the cleaned text differs from what was read.

    Args:
        filepath: Path of the file to clean.

    Returns:
        The same filepath that was passed in.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        original = f.read()

    in_code_block = False
    fixed_lines = []
    for line in original.split("\n"):
        if line.strip().startswith("```"):
            # A fence line toggles code-block state and is kept verbatim.
            in_code_block = not in_code_block
            fixed_lines.append(line)
        elif in_code_block:
            fixed_lines.append(line)
        else:
            # None of the phrase patterns spans a newline, so cleaning line by
            # line gives the same prose result as cleaning the whole text
            # while keeping code blocks out of reach.
            fixed_lines.append(fix_ai_patterns(fix_dashes_in_line(line)))

    content = "\n".join(fixed_lines)
    if content != original:
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
    return filepath
def count_issues(filepath):
    """Count remaining dashes and AI-sounding phrases in a file.

    The file is read as UTF-8 so the em/en dash characters are decoded the
    same way on every platform. Counting is case-sensitive and covers the
    whole file, including any code blocks.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        A tuple (dashes, ai_hits): the number of em/en dash characters and
        the total number of occurrences of the tracked phrases.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    dashes = content.count("—") + content.count("–")
    tracked = ("fundamentally", "essentially", "landscape", "This means that", "This ensures that")
    ai_hits = sum(content.count(phrase) for phrase in tracked)
    return dashes, ai_hits
def main():
    """Clean every Markdown draft in DRAFTS_DIR and print before/after counts.

    Files whose name contains "SAGE_AUDIT" are skipped. The check is made on
    the file name only, so a parent directory that happens to contain that
    text does not exclude every draft.
    """
    candidates = sorted(glob.glob(os.path.join(DRAFTS_DIR, "*.md")))
    files = [f for f in candidates if "SAGE_AUDIT" not in os.path.basename(f)]
    print(f"Processing {len(files)} articles...\n")
    for f in files:
        d_before, a_before = count_issues(f)
        process_file(f)
        d_after, a_after = count_issues(f)
        name = os.path.basename(f)
        print(f"  {name}: dashes {d_before}→{d_after}, AI patterns {a_before}→{a_after}")
    print("\nDone. Verify a few files manually.")


if __name__ == "__main__":
    main()