Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions advanced_standardized_data.csv

Large diffs are not rendered by default.

Binary file added advanced_standardized_data.xlsx
Binary file not shown.
21 changes: 21 additions & 0 deletions api_retriever.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import requests
import pandas as pd


def fetch_from_openalex(query):
    """Query the OpenAlex ``/works`` endpoint and return the hits as a DataFrame.

    Args:
        query: Search string sent as the ``search`` request parameter.

    Returns:
        A ``pandas.DataFrame`` built from the ``results`` list of the JSON
        response (20 results per page are requested). An empty DataFrame is
        returned when the request cannot be completed, times out, or the
        server answers with a status other than 200.
    """
    base_url = "https://api.openalex.org/works"
    params = {"search": query, "per-page": 20}

    print(f"Fetching data for: {query}...")
    try:
        # requests has no default timeout; without one a stalled connection
        # would block the pipeline forever.
        response = requests.get(base_url, params=params, timeout=30)
    except requests.RequestException as exc:
        # Network-level failures take the same "empty result" path as HTTP
        # errors so callers only need to check ``DataFrame.empty``.
        print(f"Error fetching data: {exc}")
        return pd.DataFrame()

    if response.status_code != 200:
        print(f"Error fetching data: {response.status_code}")
        return pd.DataFrame()

    data = response.json().get('results', [])
    return pd.DataFrame(data)
17,280 changes: 8,640 additions & 8,640 deletions app.py

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions csv_excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import pandas as pd
# Read the semicolon-delimited CSV export and write the same table to an
# Excel workbook, leaving out the DataFrame index column.
df = pd.read_csv(filepath_or_buffer="advanced_standardized_data.csv", sep=";")
df.to_excel(excel_writer="advanced_standardized_data.xlsx", index=False)
189 changes: 189 additions & 0 deletions dispatcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
from mapping import FIELD_MAPPINGS
import ast

def extract_authors(row):
    """Extract list of author names from authorships column.

    Args:
        row: Mapping-like record (anything with ``.get``) holding an
            ``authorships`` entry. The entry may be a list of dicts or the
            string representation of such a list.

    Returns:
        List of the non-empty ``author.display_name`` values, in input order.
        An empty list is returned when the entry is missing, is not a list,
        or is a string that cannot be parsed as a Python literal.
    """
    authorships = row.get('authorships')
    if isinstance(authorships, str):
        try:
            authorships = ast.literal_eval(authorships)
        # Only the errors literal_eval raises on malformed input; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []
    if not isinstance(authorships, list):
        return []

    authors = []
    for entry in authorships:
        if not isinstance(entry, dict):
            continue
        author_info = entry.get('author')
        if not isinstance(author_info, dict):
            continue
        name = author_info.get('display_name')
        if name:
            authors.append(name)
    return authors

def extract_cited_references(row):
    """Extract list of cited reference IDs from referenced_works column.

    Args:
        row: Mapping-like record (anything with ``.get``) holding a
            ``referenced_works`` entry, either a list or the string
            representation of a list.

    Returns:
        The list itself (not copied). An empty list is returned when the
        entry is missing, is not a list, or is an unparsable string.
    """
    refs = row.get('referenced_works')
    if isinstance(refs, str):
        try:
            refs = ast.literal_eval(refs)
        # Only the errors literal_eval raises on malformed input; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []
    return refs if isinstance(refs, list) else []

def extract_source(row):
    """Extract source/journal name from primary_location.

    Args:
        row: Mapping-like record (anything with ``.get``) holding a
            ``primary_location`` entry, either a dict or the string
            representation of a dict.

    Returns:
        ``primary_location['source']['display_name']`` as a string. The empty
        string is returned when any level is missing, has the wrong type,
        holds a null name, or the entry is an unparsable string.
    """
    primary_location = row.get('primary_location')
    if isinstance(primary_location, str):
        try:
            primary_location = ast.literal_eval(primary_location)
        # Only the errors literal_eval raises on malformed input; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return ''
    if not isinstance(primary_location, dict):
        return ''

    source = primary_location.get('source', {})
    if not isinstance(source, dict):
        return ''
    # ``or ''`` covers a display_name key that is present but null, so the
    # function returns a string on every path.
    return source.get('display_name', '') or ''

def extract_keywords(row):
    """Extract list of keyword strings from keywords column.

    Args:
        row: Mapping-like record (anything with ``.get``) holding a
            ``keywords`` entry, either a list of dicts or the string
            representation of such a list.

    Returns:
        List of the non-empty ``display_name`` values, in input order. An
        empty list is returned when the entry is missing, is not a list, or
        is an unparsable string.
    """
    keywords = row.get('keywords')
    if isinstance(keywords, str):
        try:
            keywords = ast.literal_eval(keywords)
        # Only the errors literal_eval raises on malformed input; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []
    if not isinstance(keywords, list):
        return []

    names = (kw.get('display_name') for kw in keywords if isinstance(kw, dict))
    return [name for name in names if name]

def extract_concepts(row):
    """Extract list of concept display names from concepts column.

    Args:
        row: Mapping-like record (anything with ``.get``) holding a
            ``concepts`` entry, either a list of dicts or the string
            representation of such a list.

    Returns:
        List of the non-empty ``display_name`` values, in input order. An
        empty list is returned when the entry is missing, is not a list, or
        is an unparsable string.
    """
    concepts = row.get('concepts')
    if isinstance(concepts, str):
        try:
            concepts = ast.literal_eval(concepts)
        # Only the errors literal_eval raises on malformed input; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []
    if not isinstance(concepts, list):
        return []

    names = (cpt.get('display_name') for cpt in concepts if isinstance(cpt, dict))
    return [name for name in names if name]

def extract_affiliations(row):
    """Extract list of affiliation names from authorships column.

    Args:
        row: Mapping-like record (anything with ``.get``) holding an
            ``authorships`` entry, either a list of dicts or the string
            representation of such a list.

    Returns:
        List of the distinct, non-empty institution ``display_name`` values
        found under each authorship's ``institutions`` list, in order of
        first appearance. An empty list is returned when the entry is
        missing, is not a list, or is an unparsable string.
    """
    authorships = row.get('authorships')
    if isinstance(authorships, str):
        try:
            authorships = ast.literal_eval(authorships)
        # Only the errors literal_eval raises on malformed input; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []
    if not isinstance(authorships, list):
        return []

    # A dict is used as an ordered set so the output order is deterministic
    # (a plain set would make the order vary between runs).
    affiliations = {}
    for auth in authorships:
        if not isinstance(auth, dict):
            continue
        institutions = auth.get('institutions')
        # The key may be present with a null value; skip anything that is
        # not a sequence of institution records.
        if not isinstance(institutions, (list, tuple)):
            continue
        for inst in institutions:
            if isinstance(inst, dict):
                name = inst.get('display_name')
                if name:
                    affiliations[name] = None
    return list(affiliations)

def extract_abstract(row):
    """Reconstruct abstract from abstract_inverted_index dict.

    Args:
        row: Mapping-like record (anything with ``.get``) holding an
            ``abstract_inverted_index`` entry: a dict mapping each word to
            the list of positions at which it occurs, or the string
            representation of such a dict.

    Returns:
        The words joined by single spaces in ascending position order. The
        empty string is returned when the entry is missing, is not a dict,
        is empty, or is an unparsable string.
    """
    inverted = row.get('abstract_inverted_index')
    if isinstance(inverted, str):
        try:
            inverted = ast.literal_eval(inverted)
        # Only the errors literal_eval raises on malformed input; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return ''
    if not isinstance(inverted, dict):
        return ''

    # Map position -> word. Unfilled positions simply never appear, so gaps
    # in the index do not produce runs of spaces in the output.
    word_at = {}
    for word, positions in inverted.items():
        # A word may carry a null/empty position list; skip it instead of
        # failing while iterating.
        if not isinstance(positions, (list, tuple)):
            continue
        for pos in positions:
            if isinstance(pos, int) and pos >= 0:
                word_at[pos] = word
    return ' '.join(word_at[pos] for pos in sorted(word_at)).strip()

def dispatch_and_map(df, source_name):
    """Map a raw source DataFrame onto the standardized WoS-style columns.

    Derived fields (authors, references, source, keywords, concepts,
    affiliations, abstract) are computed row by row with the ``extract_*``
    helpers; the remaining mapped fields are plain column renames. Only the
    columns listed in ``required_columns`` that actually exist are kept.

    Args:
        df: Raw DataFrame as fetched from the source. It is not modified.
        source_name: Key into ``FIELD_MAPPINGS`` selecting the field mapping.

    Returns:
        A new DataFrame containing the available standardized columns, in
        the order given by ``required_columns``.

    Raises:
        ValueError: If ``source_name`` has no entry in ``FIELD_MAPPINGS``.
    """
    if source_name not in FIELD_MAPPINGS:
        raise ValueError(f"Source '{source_name}' not supported. Check mapping.py.")

    mapping = FIELD_MAPPINGS[source_name]

    # Work on a copy so the derived columns below are not written into the
    # caller's DataFrame.
    df = df.copy()

    # (target tag, source field that triggers extraction, progress message,
    # extractor). Order matches the order in which the columns are created.
    derived_fields = (
        ('AU', 'authorships', "Extracting authors...", extract_authors),
        ('CR', 'referenced_works', "Extracting cited references...",
         extract_cited_references),
        ('SO', 'primary_location.source.display_name', "Extracting source...",
         extract_source),
        ('DE', 'keywords', "Extracting keywords...", extract_keywords),
        ('ID', 'concepts', "Extracting concepts...", extract_concepts),
        ('C1', 'authorships', "Extracting affiliations...", extract_affiliations),
        ('AB', 'abstract_inverted_index', "Extracting abstract...",
         extract_abstract),
    )

    handled = set()
    for target, source_field, message, extractor in derived_fields:
        if mapping.get(target) == source_field:
            print(message)
            df[target] = df.apply(extractor, axis=1)
            handled.add(target)

    # Rename columns, skipping only the targets that were actually created
    # above. A mapping that points one of those tags at a plain column is
    # still renamed instead of being silently dropped.
    rename_dict = {
        source: target
        for target, source in mapping.items()
        if target not in handled and source in df.columns
    }
    df = df.rename(columns=rename_dict)

    # Keep only the required WoS columns (as per exam Table 4.2)
    required_columns = [
        'UT', 'DI', 'PMID', 'TI', 'SO', 'JI', 'PY', 'DT', 'LA', 'TC',
        'AU', 'AF', 'C1', 'RP', 'CR', 'DE', 'ID', 'AB', 'VL', 'IS', 'BP', 'EP', 'SR'
    ]
    # Keep only columns that actually exist in the DataFrame
    existing_cols = [col for col in required_columns if col in df.columns]

    # Return an independent frame: later column assignments on a bare column
    # selection can raise pandas' SettingWithCopyWarning.
    return df[existing_cols].copy()
32 changes: 32 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from dispatcher import dispatch_and_map
from validator import enforce_types
from api_retriever import fetch_from_openalex


def run_advanced_etl(query):
    """Run the fetch -> map -> type-enforce -> CSV export pipeline for *query*.

    The standardized table is written to ``advanced_standardized_data.csv``
    (semicolon-separated, no index) and a short preview is printed. Nothing
    is written when the fetch yields an empty DataFrame. Returns ``None``.
    """
    print(f"Starting Advanced ETL for: '{query}'")

    # Extract: pull the raw records from the OpenAlex API.
    fetched = fetch_from_openalex(query)
    if fetched.empty:
        print("No data found.")
        return

    # Transform: map the 'openalex' fields to standard columns, then coerce
    # the column types.
    standardized = enforce_types(dispatch_and_map(fetched, 'openalex'))

    # Load: export and show a preview of the normalized result.
    output_path = "advanced_standardized_data.csv"
    standardized.to_csv(output_path, sep=';', index=False)
    print(f"Pipeline complete! File saved: {output_path}")
    print("First 5 rows:")
    print(standardized.head())
    print("\nAll column names in the final DataFrame:")
    print(standardized.columns.tolist())

# Run the pipeline with a sample query only when this file is executed as a
# script, so that importing the module does not trigger a network fetch and
# a file write.
if __name__ == "__main__":
    run_advanced_etl("machine learning")
18 changes: 18 additions & 0 deletions mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# OpenAlex field read for each standardized two-letter column tag.
_OPENALEX_FIELDS = dict(
    # Plain fields, mapped by name.
    UT="id",
    TI="title",
    PY="publication_year",
    TC="cited_by_count",
    DI="doi",
    # Nested structure: author names are pulled out of the authorships list.
    AU="authorships",
    DT="type",
    LA="language",
    # Dotted path into the nested primary_location record.
    SO="primary_location.source.display_name",
    CR="referenced_works",
    DE="keywords",
    ID="concepts",
    # Affiliations come from the same authorships structure as AU.
    C1="authorships",
    AB="abstract_inverted_index",
)

# Per-source mappings, keyed by source name.
FIELD_MAPPINGS = {"openalex": _OPENALEX_FIELDS}
46 changes: 46 additions & 0 deletions test_all_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pandas as pd
import sys
import os
import traceback

# Add the path to the functions folder. The default below is specific to one
# machine; set the BIBLIOMETRIX_FUNCTIONS_PATH environment variable to point
# at your own checkout instead of editing this file.
functions_path = os.environ.get(
    "BIBLIOMETRIX_FUNCTIONS_PATH",
    r"C:\Users\179518\Desktop\All Files\Data Science\Second semester\Hardware & Software_2nd_semester\bibliometrix-python\functions",
)
sys.path.append(functions_path)

# Import functions
from functions.get_annualproduction import get_annual_production
from functions.get_averagecitations import get_average_citations
from functions.get_relevantsources import get_relevant_sources
from functions.get_relevantauthors import get_relevant_authors

# Read the semicolon-delimited standardized export into a DataFrame.
df = pd.read_csv(filepath_or_buffer="advanced_standardized_data.csv", sep=";")

# Thin wrapper so a DataFrame can be handed to the functions under test,
# which receive it as their first argument.
class DataWrapper:
    """Hold an object on ``data`` and hand it back through ``get()``."""

    def __init__(self, df):
        # The reference is stored as-is; no copy is made.
        self.data = df

    def get(self):
        """Return the object given to the constructor."""
        return self.data

wrapped = DataWrapper(df)

# Each entry: (label, function under test, extra positional arguments passed
# after the wrapped data).
tests = [
    ("Annual Production", get_annual_production, []),
    ("Average Citations", get_average_citations, []),
    ("Relevant Sources", get_relevant_sources, [10]),
    ("Relevant Authors", get_relevant_authors, [10, "n_docs"]),
]

# Quick diagnostics on the loaded table before running anything.
print("Column names:", df.columns.tolist())
for label, column, missing_note in (
    ("TC NaN count:", 'TC', "TC missing"),
    ("PY NaN count:", 'PY', "PY missing"),
):
    if column in df.columns:
        print(label, df[column].isna().sum())
    else:
        print(label, missing_note)

# Call every function; a failure is reported with its traceback and the loop
# moves on to the next entry.
for name, func, args in tests:
    try:
        result = func(wrapped, *args)
        print(f"✅ {name} passed")
    except Exception as e:
        print(f"❌ {name} failed: {e}")
        print(traceback.format_exc())
47 changes: 47 additions & 0 deletions test_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pandas as pd
import sys
import os

# Make the parent of this file's directory importable (expected to be the
# project root) so the local ``functions`` package can be found.
this_file = os.path.abspath(__file__)
sys.path.append(os.path.dirname(os.path.dirname(this_file)))

# Import the corrected function name
from functions.get_annualproduction import get_annual_production


# Helper that mimics the custom data object the analysis function expects:
# the function obtains the DataFrame through ``.get()``.
class DataWrapper:
    """Hold an object on ``data`` and hand it back through ``get()``."""

    def __init__(self, df):
        # The reference is stored as-is; no copy is made.
        self.data = df

    def get(self):
        """Return the object given to the constructor."""
        return self.data


# 1. Location of the standardized export produced by main.py
file_path = "advanced_standardized_data.csv"

if os.path.exists(file_path):
    df = pd.read_csv(file_path, sep=';')

    # 2. Run the function; any exception is reported with its traceback
    #    instead of aborting the script.
    try:
        print("Running analysis...")

        # The function reads its input through ``.get()``, hence the wrapper.
        wrapped_df = DataWrapper(df)

        # The call returns a (figure, data) pair; only the data is previewed.
        fig, pub_data = get_annual_production(wrapped_df)

        print("Function passed!")
        print("--- First 5 rows of calculated annual production ---")
        print(pub_data.head())

    except Exception as e:
        print(f"Function crashed! Error: {e}")
        import traceback

        traceback.print_exc()
else:
    print(f"Error: {file_path} not found. Run main.py first to generate it.")
23 changes: 23 additions & 0 deletions validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd


def enforce_types(df):
    """Coerce the standardized columns of *df* to their expected types.

    * ``TC`` and ``PY`` become integers; values that cannot be parsed as
      numbers, and missing values, become ``0``.
    * ``AU``, ``AF``, ``C1``, ``CR``, ``DE`` and ``ID`` become lists. Existing
      lists are kept unchanged, strings are split on ``';'`` with surrounding
      whitespace stripped and empty pieces dropped, and anything else
      (``None``, NaN, ...) becomes an empty list.

    Columns that are absent are skipped.

    Args:
        df: DataFrame to normalize. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # 1-2. Citation count (TC) and publication year (PY) as integers.
    for col in ('TC', 'PY'):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # 3. Ensure multi-value fields are lists, preserve existing lists
    list_columns = ['AU', 'AF', 'C1', 'CR', 'DE', 'ID']
    for col in list_columns:
        if col in df.columns:
            df[col] = df[col].apply(_split_multi_value)

    # 4. TODO: SR computation is not implemented here (the original left a
    #    placeholder for existing SR code).

    return df


def _split_multi_value(value):
    """Return *value* as a list: lists pass through, strings are split on ';'."""
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        # Plain ``split`` would turn "" into [''] and keep the blank after
        # each separator ("A; B" -> ['A', ' B']).
        return [part.strip() for part in value.split(';') if part.strip()]
    return []