Skip to content

Commit 2f27007

Browse files
committed
Added --tag-min-threshold functionality that will only import labels that were used on at least X questions. Also added script for deleting all labels from a repo to allow multiple testing of loading labels
1 parent c419893 commit 2f27007

File tree

5 files changed

+759
-3
lines changed

5 files changed

+759
-3
lines changed

utils/stackoverflow/README.md

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,9 @@ python populate_discussion.py --repo OWNER/REPO --category CATEGORY_NAME [option
115115
- `--clean`: Delete all discussions, comments, and labels before import
116116
- `--clean-only`: Delete all discussions, comments, and labels, then exit
117117
- `--clean-category`: Used with --category and --clean or --clean-only to delete discussions in the specified category only
118+
- `--api-delay`: Minimum seconds between API calls (default: 1.0)
119+
- `--ignore-tags`: List of tags to ignore (space-separated). Questions tagged with these tag(s) will not be processed.
120+
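- `--tag-min-threshold`: Minimum number of questions a tag must be associated with for a label to be created for it (default: 1). Tags below the threshold are not created as labels and are removed from the tags of imported questions.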
118121

119122
#### Example
120123
```
@@ -235,3 +238,76 @@ Use the extracted JSON file as input to retry failed questions:
235238
```bash
236239
python populate_discussion.py --repo owner/repo --category "Q&A" --questions-file retry_questions.json
237240
```
241+
242+
## Label Management (delete_all_labels.py)
243+
A utility script for deleting all labels in a GitHub repository. This tool is particularly useful for cleaning up repositories after Stack Overflow migrations or when you need to reset repository labels completely.
244+
245+
### Requirements
246+
* Python 3.x
247+
* Dependencies listed in requirements.txt
248+
* GitHub App with appropriate permissions (Contents, Metadata)
249+
250+
### Setup
251+
1. Install the required dependencies:
252+
```
253+
pip install -r requirements.txt
254+
```
255+
2. Set up GitHub App authentication by setting these environment variables:
256+
```
257+
export GHD_INSTALLATION_ID=your_installation_id
258+
export GHD_APP_ID=your_github_app_id
259+
export GHD_PRIVATE_KEY=/path/to/your/private-key.pem
260+
```
261+
262+
### Usage
263+
```
264+
python delete_all_labels.py --repo OWNER/REPO [options]
265+
```
266+
267+
#### Parameters
268+
- `--repo` (required): GitHub repository in format owner/name
269+
- `--api-delay`: Minimum seconds between API calls (default: 1.0)
270+
- `--dry-run`: Show what would be deleted without actually deleting
271+
- `--force`: Skip confirmation prompt
272+
273+
#### Examples
274+
275+
**Preview what would be deleted (recommended first step):**
276+
```bash
277+
python delete_all_labels.py --repo bcgov/developer-experience-team --dry-run
278+
```
279+
280+
**Delete all labels with confirmation:**
281+
```bash
282+
python delete_all_labels.py --repo bcgov/developer-experience-team
283+
```
284+
285+
**Delete all labels without confirmation (for living dangerously):**
286+
```bash
287+
python delete_all_labels.py --repo bcgov/developer-experience-team --force
288+
```
289+
290+
**Delete with custom API delay:**
291+
```bash
292+
python delete_all_labels.py --repo bcgov/developer-experience-team --api-delay 2.0
293+
```
294+
295+
296+
### Output
297+
The script provides detailed feedback including:
298+
- Total number of labels found
299+
- Preview of labels to be deleted (first 10 shown)
300+
- Progress updates during deletion
301+
- Summary of successful and failed deletions
302+
- All operations are logged to the console and to `delete_labels.log`
303+
304+
### Safety Warnings
305+
⚠️ **DESTRUCTIVE OPERATION**: This script permanently deletes all labels from the specified repository. This action cannot be undone.
306+
307+
**Best Practices:**
308+
1. Always run with `--dry-run` first to preview changes
309+
2. Ensure you have backups or can recreate labels if needed
310+
3. Use `--force` only in automated scripts where confirmation isn't possible
311+
4. Monitor the logs for any failures during bulk operations
312+
313+
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
import logging
2+
import argparse
3+
from github.GithubException import GithubException
4+
from populate_discussion_helpers import RateLimiter, GitHubAuthManager, GraphQLHelper
5+
from typing import Dict, List, Any
6+
7+
8+
# Get logger for this module
9+
logger = logging.getLogger(__name__)
10+
11+
12+
def setup_logging():
    """Configure root logging to write to delete_labels.log and the console.

    Does nothing when the root logger already has a handler, so calling this
    more than once never attaches duplicate handlers.
    """
    root = logging.getLogger()
    if root.hasHandlers():
        # Logging was configured earlier; leave that configuration untouched.
        return

    root.setLevel(logging.INFO)
    log_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # The file handler is attached first, then the console handler.
    for handler in (logging.FileHandler('delete_labels.log'), logging.StreamHandler()):
        handler.setFormatter(log_format)
        root.addHandler(handler)
27+
28+
def delete_labels(github_graphql: GraphQLHelper, labels_to_delete: List[Dict[str, Any]], dry_run: bool):
    """Delete each of the given labels, or only report them in dry-run mode.

    Args:
        github_graphql: Helper used to send the deleteLabel GraphQL mutation.
        labels_to_delete: Label dicts; each is read via its 'id' and 'name' keys.
        dry_run: When True, log what would be deleted without sending any request.

    A failed deletion is logged and counted, and the remaining labels are still
    processed. A summary line is logged once all labels have been handled.
    """
    if not labels_to_delete:
        logger.info("No labels found to delete")
        return

    # The mutation text is the same for every label; only the variables change.
    delete_mutation = """
        mutation($labelId: ID!) {
            deleteLabel(input: {id: $labelId}) {
                clientMutationId
            }
        }
        """

    deleted_count = 0
    failed_count = 0

    for label in labels_to_delete:
        label_id = label['id']

        if dry_run:
            logger.info(f"Dry run: Would delete label: '{label['name']}' (ID: {label_id})")
            deleted_count += 1
            continue

        try:
            github_graphql.github_graphql_request(delete_mutation, {'labelId': label_id})
            logger.info(f"Deleted label: '{label['name']}'")
            deleted_count += 1
        except Exception as exc:
            # Keep going: one failing label must not abort the whole run.
            logger.error(f"Failed to delete label '{label['name']}': {exc}")
            failed_count += 1

    verb = "Would delete" if dry_run else "Deleted"
    logger.info(f"Summary: {verb} {deleted_count} labels successfully")
    if failed_count > 0:
        logger.warning(f"Failed to delete {failed_count} labels")
63+
64+
def get_all_labels(github_graphql: GraphQLHelper, owner: str, name: str) -> List[Dict[str, Any]]:
    """Return every label of the repository `owner/name`.

    Labels are requested 100 per page (fields: name, id) and the GraphQL cursor
    is followed until the API reports no further page. Any error while
    fetching or reading a page is logged and then re-raised.
    """
    labels_query = """
        query($owner: String!, $name: String!, $after: String) {
            repository(owner: $owner, name: $name) {
                labels(first: 100, after: $after) {
                    nodes {
                        name
                        id
                    }
                    pageInfo {
                        hasNextPage
                        endCursor
                    }
                }
            }
        }
        """

    logger.info(f"Fetching labels from repository {owner}/{name}...")

    collected = []
    cursor = None  # None requests the first page

    while True:
        try:
            response = github_graphql.github_graphql_request(
                labels_query,
                {'owner': owner, 'name': name, 'after': cursor},
            )
            page = response['repository']['labels']
            collected.extend(page['nodes'])
            more_pages = page['pageInfo']['hasNextPage']
            cursor = page['pageInfo']['endCursor']
        except Exception as exc:
            logger.error(f"Failed to fetch labels: {exc}")
            raise

        if not more_pages:
            break

    logger.info(f"Found {len(collected)} labels")
    return collected
103+
104+
105+
def main():
    """Command-line entry point: delete every label in a GitHub repository.

    Parses the arguments, validates them, authenticates, fetches all labels,
    asks for confirmation (unless --dry-run or --force is given) and then
    deletes the labels, or only reports them in dry-run mode.

    Raises:
        ValueError: If --repo is not of the form 'owner/name' with a
            non-empty owner and name.
    """
    setup_logging()

    parser = argparse.ArgumentParser(description='Delete labels for given GitHub Discussions repo')
    parser.add_argument('--repo', required=True, help='Repository in format owner/name')
    parser.add_argument('--api-delay', type=float, default=1.0, help='Minimum seconds between API calls (default: 1.0)')
    parser.add_argument('--dry-run', action='store_true', help='Show what would be deleted without actually deleting')
    parser.add_argument('--force', action='store_true', help='Skip confirmation prompt')
    args = parser.parse_args()

    # Validate --repo before authenticating so a malformed value fails fast
    # instead of after authentication has been set up. Empty components
    # such as '/repo' or 'owner/' are rejected as well.
    repo_parts = args.repo.split('/')
    if len(repo_parts) != 2 or not all(repo_parts):
        raise ValueError("Repository must be in format 'owner/name'")
    owner, name = repo_parts

    if args.api_delay < 0:
        logger.warning("Negative API delay specified, defaulting to 1.0 seconds")
        args.api_delay = 1.0

    # Set the API delay based on user preference
    rate_limiter = RateLimiter(min_interval=args.api_delay)
    logger.info(f"Using API delay of {args.api_delay} seconds between requests")

    # Initialize GitHub authentication
    github_auth_manager = GitHubAuthManager()
    github_auth_manager.initialize()

    github_graphql = GraphQLHelper(github_auth_manager, rate_limiter)

    repo = github_auth_manager.get_client().get_repo(f"{owner}/{name}")

    logger.info(f"Deleting labels in repo '{repo.full_name}'")
    logger.info(f"Dry run mode: {args.dry_run}")

    # Get all labels from the repository
    labels_to_delete = get_all_labels(github_graphql, owner, name)

    if not labels_to_delete:
        logger.info("No labels found in repository")
        return

    # Ask for confirmation unless it's a dry run or force flag is used
    if not args.dry_run and not args.force:
        print(f"\nWARNING: This will permanently delete {len(labels_to_delete)} labels from {repo.full_name}")
        print("Labels to be deleted:")
        for label in labels_to_delete[:10]:  # Show first 10
            print(f" - {label['name']}")
        if len(labels_to_delete) > 10:
            print(f" ... and {len(labels_to_delete) - 10} more")

        try:
            response = input("\nAre you sure you want to proceed? (yes/no): ").lower().strip()
        except EOFError:
            # stdin is closed or not interactive (e.g. run from CI without
            # --force): never delete without an explicit confirmation.
            logger.warning("No input available for confirmation; use --force to skip the prompt")
            return

        if response not in ['yes', 'y']:
            logger.info("Operation cancelled by user")
            return

    # Delete all labels from the repository
    delete_labels(github_graphql, labels_to_delete, args.dry_run)
    logger.info("Label deletion process completed")
164+
165+
166+
if __name__ == '__main__':
167+
main()

utils/stackoverflow/populate_discussion.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def find_discussion_by_title(github_graphql, owner: str, name: str, title: str,
284284
return None
285285

286286
def clean_repo_discussions(github_graphql, owner: str, name: str, category: Optional[Category] = None):
287-
"""Delete all discussions, their comments, and remove all labels from the repo. Optionally filter by category."""
287+
"""Delete all discussions, their comments, and unlabel them from the repo. Optionally filter by category."""
288288

289289
if not category:
290290
logger.warning("Cleaning all discussions, comments, and labels from the repository!")
@@ -463,9 +463,21 @@ def main():
463463
parser.add_argument('--ignore-tags',
464464
type=str,
465465
nargs='+',
466-
help='List of tags to ignore (space-separated)')
466+
help='List of tags to ignore (space-separated). Questions tagged with these tag(s) will not be processed.')
467+
parser.add_argument('--tag-min-threshold',
468+
type=int,
469+
default=1,
470+
help='Minimum number of questions a tag must be associated with to be considered for label creation (default: 1)')
467471
args = parser.parse_args()
468472

473+
if args.tag_min_threshold < 0:
474+
logger.warning("Negative tag minimum threshold specified, defaulting to 1")
475+
args.tag_min_threshold = 1
476+
477+
if args.api_delay < 0:
478+
logger.warning("Negative API delay specified, defaulting to 1.0 seconds")
479+
args.api_delay = 1.0
480+
469481
# Set the global API delay based on user preference
470482
rate_limiter = RateLimiter(min_interval=args.api_delay)
471483
logger.info(f"Using API delay of {args.api_delay} seconds between requests")
@@ -506,6 +518,10 @@ def main():
506518
# Load data
507519
questions = load_json(args.questions_file)
508520
tags_data = load_json(args.tags_file)
521+
522+
tags_under_threshold = get_tags_under_threshold(args.tag_min_threshold, tags_data)
523+
524+
tags_data = get_tags_at_or_above_threshold(args.tag_min_threshold, tags_data)
509525

510526
# Get or create tags as labels
511527
existing_labels = get_labels(repo)
@@ -541,6 +557,8 @@ def main():
541557
if tags_to_ignore_helper.should_ignore(tags):
542558
logger.info(f"Skipping question {question.get('question_id', 'Unknown ID')} - {title} due to its tags containing ignored tag(s): {', '.join(tags)}")
543559
continue
560+
561+
tags = remove_tags_under_threshold(tags_under_threshold, tags)
544562

545563
body = question.get('body', '')
546564

@@ -704,6 +722,42 @@ def main():
704722
logger.error(f"Error processing question_id {question_id} question #{i+1}: {e}")
705723
continue
706724

725+
def remove_tags_under_threshold(tags_under_threshold: List[str], tags: List[str]) -> List[str]:
    """Remove tags that are under the threshold from the given tags list.

    Args:
        tags_under_threshold: Tag names that are under threshold
        tags: List of tag names to filter

    Returns:
        A new list with the tags that are not in tags_under_threshold, in
        their original order. The input list is not modified.
    """
    if not tags_under_threshold:
        # Nothing to exclude; still return a fresh list, as callers expect.
        return list(tags)

    # Build the lookup set once so each membership test is O(1) instead of a
    # linear scan of the under-threshold list for every tag.
    if isinstance(tags_under_threshold, (set, frozenset)):
        excluded = tags_under_threshold
    else:
        excluded = set(tags_under_threshold)

    return [tag for tag in tags if tag not in excluded]
736+
737+
def get_tags_under_threshold(min_threshold: int, tags_data: List[Dict[str, Any]]) -> List[str]:
    """Return the names of tags whose count is below the minimum threshold.

    Args:
        min_threshold: Minimum count threshold
        tags_data: List of tag dictionaries from tags.json

    Returns:
        Names of the tags with count < min_threshold, in input order. A tag
        without a 'count' key is treated as having a count of 0.
    """
    def is_below(entry: Dict[str, Any]) -> bool:
        return entry.get('count', 0) < min_threshold

    return [entry['name'] for entry in tags_data if is_below(entry)]
748+
749+
def get_tags_at_or_above_threshold(min_threshold: int, tags_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return the tag dictionaries whose count meets the minimum threshold.

    Args:
        min_threshold: Minimum count threshold
        tags_data: List of tag dictionaries from tags.json

    Returns:
        The tag dictionaries with count >= min_threshold, in input order. A
        tag without a 'count' key is treated as having a count of 0.
    """
    kept = []
    for entry in tags_data:
        usage_count = entry.get('count', 0)
        if usage_count >= min_threshold:
            kept.append(entry)
    return kept
760+
707761
def get_readable_date(the_date):
708762
"""Convert creation_date to a readable string format."""
709763
if the_date:

utils/stackoverflow/so_explore.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@
1919
# dd.sql("select tags.name, tags.count, age(to_timestamp(tags.last_activity_date)) from './tags.json' as tags order by tags.last_activity_date desc, tags.count desc").show()
2020
# dd.sql("select questions.question_id, questions.title, from './discussions_to_add.json' as questions where list_contains(questions.tags, 'openshift')").show()
2121
# dd.sql("select questions.question_id, questions.title, questions.tags from './discussions_to_add.json' as questions order by questions.question_id").show()
22+
# dd.sql("select tags.name, tags.count, age(to_timestamp(tags.last_activity_date)) from './tags.json' as tags where tags.count < 2 order by tags.last_activity_date desc, tags.count desc").show()

0 commit comments

Comments
 (0)