Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 175 additions & 37 deletions gfi/test_data.py
Original file line number Diff line number Diff line change
@@ -1,58 +1,196 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Data file validation tests for repository and labels data."""

import json
import os
import unittest
from collections import Counter
from typing import Any, Dict

import toml

DATA_FILE_PATH = "data/repositories.toml"
LABELS_FILE_PATH = "data/labels.json"


def _get_data_from_toml(file_path):
with open(file_path, "r") as file_desc:
return toml.load(file_desc)
def _load_data_file(file_path: str, file_format: str = "auto") -> Dict[str, Any]:
"""
Load data from a file (TOML or JSON format).

Args:
file_path: Path to the data file to load.
file_format: File format type - "auto", "toml", or "json".
If "auto", determines format from file extension.

Returns:
Parsed data as a dictionary.

Raises:
FileNotFoundError: If the file does not exist.
ValueError: If file format is unsupported.
toml.TomlDecodeError: If TOML file is invalid.
json.JSONDecodeError: If JSON file is invalid.
"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"Data file not found: {file_path}")

# Determine file format if not explicitly specified
if file_format == "auto":
if file_path.endswith(".toml"):
file_format = "toml"
elif file_path.endswith(".json"):
file_format = "json"
else:
raise ValueError(
f"Unable to determine file format for {file_path}. "
"Specify format explicitly or use .toml/.json extension."
)

def _get_data_from_json(file_path):
with open(file_path, "r") as file_desc:
return json.load(file_desc)
try:
with open(file_path, "r", encoding="utf-8") as file_desc:
if file_format == "toml":
return toml.load(file_desc)
elif file_format == "json":
return json.load(file_desc)
else:
raise ValueError(f"Unsupported file format: {file_format}")
except (IOError, OSError) as e:
raise IOError(f"Error reading file {file_path}: {e}") from e


class TestDataSanity(unittest.TestCase):
"""Test for sanity of the data file."""

@staticmethod
def test_data_file_exists():
"""Verify that the data file exists."""
assert os.path.exists(DATA_FILE_PATH)

@staticmethod
def test_labels_file_exists():
"""Verify that the labels file exists."""
assert os.path.exists(LABELS_FILE_PATH)

@staticmethod
def test_data_file_sane():
"""Verify that the file is a valid TOML with required data."""
data = _get_data_from_toml(DATA_FILE_PATH)
assert "repositories" in data

@staticmethod
def test_labels_file_sane():
"""Verify that the labels file is a valid JSON"""
data = _get_data_from_json(LABELS_FILE_PATH)
assert "labels" in data

@staticmethod
def test_no_duplicates():
"""Verify that all entries are unique."""
data = _get_data_from_toml(DATA_FILE_PATH)
repos = data.get("repositories", [])
print([item for item, count in Counter(repos).items() if count > 1])
assert len(repos) == len(set(repos))
"""Comprehensive tests for data file integrity and validity."""

def test_data_file_exists(self) -> None:
"""Verify that the repository data file exists."""
self.assertTrue(
os.path.exists(DATA_FILE_PATH),
f"Repository data file not found at {DATA_FILE_PATH}",
)

def test_labels_file_exists(self) -> None:
"""Verify that the labels data file exists."""
self.assertTrue(
os.path.exists(LABELS_FILE_PATH),
f"Labels data file not found at {LABELS_FILE_PATH}",
)

def test_data_file_valid_format(self) -> None:
"""Verify that the repository data file is valid TOML."""
try:
data = _load_data_file(DATA_FILE_PATH, "toml")
self.assertIsInstance(data, dict, "TOML file should parse to a dictionary")
except toml.TomlDecodeError as e:
self.fail(f"Repository data file is invalid TOML: {e}")
except IOError as e:
self.fail(f"Could not read repository data file: {e}")

def test_labels_file_valid_format(self) -> None:
"""Verify that the labels file is valid JSON."""
try:
data = _load_data_file(LABELS_FILE_PATH, "json")
self.assertIsInstance(data, dict, "JSON file should parse to a dictionary")
except json.JSONDecodeError as e:
self.fail(f"Labels file is invalid JSON: {e}")
except IOError as e:
self.fail(f"Could not read labels file: {e}")

def test_data_file_has_required_structure(self) -> None:
"""Verify that the repository data file has the required 'repositories' key."""
try:
data = _load_data_file(DATA_FILE_PATH, "toml")
self.assertIn(
"repositories",
data,
"Repository data file must contain 'repositories' key",
)
self.assertIsInstance(
data["repositories"],
list,
"'repositories' must be a list",
)
except IOError as e:
self.fail(f"Could not validate repository data structure: {e}")

def test_labels_file_has_required_structure(self) -> None:
"""Verify that the labels file has the required 'labels' key."""
try:
data = _load_data_file(LABELS_FILE_PATH, "json")
self.assertIn(
"labels",
data,
"Labels file must contain 'labels' key",
)
self.assertIsInstance(
data["labels"],
list,
"'labels' must be a list",
)
except IOError as e:
self.fail(f"Could not validate labels data structure: {e}")

def test_repositories_have_no_duplicates(self) -> None:
"""Verify that all repository entries are unique."""
try:
data = _load_data_file(DATA_FILE_PATH, "toml")
repos = data.get("repositories", [])

# Find duplicates
duplicates = [
item for item, count in Counter(repos).items() if count > 1
]

self.assertEqual(
len(repos),
len(set(repos)),
f"Found duplicate repositories: {duplicates}",
)
except IOError as e:
self.fail(f"Could not check for duplicate repositories: {e}")

def test_labels_have_no_duplicates(self) -> None:
"""Verify that all label entries are unique."""
try:
data = _load_data_file(LABELS_FILE_PATH, "json")
labels = data.get("labels", [])

# Find duplicates
duplicates = [
item for item, count in Counter(labels).items() if count > 1
]

self.assertEqual(
len(labels),
len(set(labels)),
f"Found duplicate labels: {duplicates}",
)
except IOError as e:
self.fail(f"Could not check for duplicate labels: {e}")

def test_repositories_not_empty(self) -> None:
"""Verify that the repository list is not empty."""
try:
data = _load_data_file(DATA_FILE_PATH, "toml")
repos = data.get("repositories", [])
self.assertTrue(
repos,
"Repository list must not be empty",
)
except IOError as e:
self.fail(f"Could not validate repository list: {e}")

def test_labels_not_empty(self) -> None:
"""Verify that the labels list is not empty."""
try:
data = _load_data_file(LABELS_FILE_PATH, "json")
labels = data.get("labels", [])
self.assertTrue(
labels,
"Labels list must not be empty",
)
except IOError as e:
self.fail(f"Could not validate labels list: {e}")


if __name__ == "__main__":
Expand Down