PRAISELab-PicusLab · hadimobini00-ship-it · May 30, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/advanced_standardized_data.csv b/advanced_standardized_data.csv
diff --git a/advanced_standardized_data.xlsx b/advanced_standardized_data.xlsx
diff --git a/api_retriever.py b/api_retriever.py
@@ -0,0 +1,21 @@
+import requests
+import pandas as pd
+
+
+def fetch_from_openalex(query, per_page=20, timeout=30):
+    """
+    Query the OpenAlex works endpoint and return the matches as a DataFrame.
+
+    Args:
+        query: Free-text search string sent as the ``search`` parameter.
+        per_page: Number of results requested through ``per-page``. Defaults
+            to 20, the value that used to be hard-coded.
+        timeout: Seconds to wait for the server. Without a timeout
+            ``requests.get`` may block indefinitely on a stalled connection.
+
+    Returns:
+        A DataFrame built from the ``results`` list of the JSON response, or
+        an empty DataFrame when the request fails, the status code is not
+        200, or the body cannot be decoded as JSON.
+    """
+    base_url = "https://api.openalex.org/works"
+    params = {"search": query, "per-page": per_page}
+
+    print(f"Fetching data for: {query}...")
+    try:
+        response = requests.get(base_url, params=params, timeout=timeout)
+    except requests.RequestException as exc:
+        # Connection errors and timeouts are reported the same way as a bad
+        # status code so callers only ever have to check for an empty frame.
+        print(f"Error fetching data: {exc}")
+        return pd.DataFrame()
+
+    if response.status_code != 200:
+        print(f"Error fetching data: {response.status_code}")
+        return pd.DataFrame()
+
+    try:
+        payload = response.json()
+    except ValueError as exc:
+        # JSON decoding errors raised by requests derive from ValueError.
+        print(f"Error fetching data: {exc}")
+        return pd.DataFrame()
+
+    results = payload.get('results') if isinstance(payload, dict) else None
+    return pd.DataFrame(results or [])
diff --git a/app.py b/app.py
diff --git a/csv_excel.py b/csv_excel.py
@@ -0,0 +1,3 @@
+import pandas as pd
+# Convert the ';'-separated standardized CSV into an Excel workbook.
+# index=False keeps the DataFrame index out of the written sheet.
+df = pd.read_csv("advanced_standardized_data.csv", sep=';')
+df.to_excel("advanced_standardized_data.xlsx", index=False)
diff --git a/dispatcher.py b/dispatcher.py
@@ -0,0 +1,189 @@
+from mapping import FIELD_MAPPINGS
+import ast
+
+def extract_authors(row):
+    """Extract the list of author names from the ``authorships`` field.
+
+    Args:
+        row: Record exposing ``.get`` whose ``authorships`` value is either a
+            list of dicts or the string repr of such a list.
+
+    Returns:
+        The ``author.display_name`` values in their original order; an empty
+        list when the field is missing, unparsable, or not a list.
+    """
+    authorships = row.get('authorships')
+    if isinstance(authorships, str):
+        try:
+            authorships = ast.literal_eval(authorships)
+        except (ValueError, SyntaxError, TypeError,
+                MemoryError, RecursionError):
+            # Only parse failures are swallowed; a bare except would also
+            # hide KeyboardInterrupt and SystemExit.
+            return []
+    if not isinstance(authorships, list):
+        return []
+    authors = []
+    for auth in authorships:
+        if not isinstance(auth, dict):
+            continue
+        author_info = auth.get('author')
+        if isinstance(author_info, dict) and author_info.get('display_name'):
+            authors.append(author_info['display_name'])
+    return authors
+
+def extract_cited_references(row):
+    """Extract the list of cited reference IDs from ``referenced_works``.
+
+    Args:
+        row: Record exposing ``.get`` whose ``referenced_works`` value is
+            either a list or the string repr of a list.
+
+    Returns:
+        The list as stored (or as parsed from its string form); an empty
+        list when the field is missing, unparsable, or not a list.
+    """
+    refs = row.get('referenced_works')
+    if isinstance(refs, str):
+        try:
+            refs = ast.literal_eval(refs)
+        except (ValueError, SyntaxError, TypeError,
+                MemoryError, RecursionError):
+            # Only parse failures are swallowed; a bare except would also
+            # hide KeyboardInterrupt and SystemExit.
+            return []
+    return refs if isinstance(refs, list) else []
+
+def extract_source(row):
+    """Extract the source/journal name from ``primary_location``.
+
+    Args:
+        row: Record exposing ``.get`` whose ``primary_location`` value is
+            either a dict or the string repr of a dict.
+
+    Returns:
+        ``primary_location['source']['display_name']`` when present, else an
+        empty string. A null ``display_name`` also yields ``''`` so the
+        result is always a string.
+    """
+    primary_location = row.get('primary_location')
+    if isinstance(primary_location, str):
+        try:
+            primary_location = ast.literal_eval(primary_location)
+        except (ValueError, SyntaxError, TypeError,
+                MemoryError, RecursionError):
+            # Only parse failures are swallowed; a bare except would also
+            # hide KeyboardInterrupt and SystemExit.
+            return ''
+    if not isinstance(primary_location, dict):
+        return ''
+    source = primary_location.get('source')
+    if isinstance(source, dict):
+        # `or ''` covers a display_name key that is present but null.
+        return source.get('display_name') or ''
+    return ''
+
+def extract_keywords(row):
+    """Extract the list of keyword strings from the ``keywords`` field.
+
+    Args:
+        row: Record exposing ``.get`` whose ``keywords`` value is either a
+            list of dicts or the string repr of such a list.
+
+    Returns:
+        The non-empty ``display_name`` of every dict entry, in order; an
+        empty list when the field is missing, unparsable, or not a list.
+    """
+    keywords = row.get('keywords')
+    if isinstance(keywords, str):
+        try:
+            keywords = ast.literal_eval(keywords)
+        except (ValueError, SyntaxError, TypeError,
+                MemoryError, RecursionError):
+            # Only parse failures are swallowed; a bare except would also
+            # hide KeyboardInterrupt and SystemExit.
+            return []
+    if not isinstance(keywords, list):
+        return []
+    return [
+        kw['display_name']
+        for kw in keywords
+        if isinstance(kw, dict) and kw.get('display_name')
+    ]
+
+def extract_concepts(row):
+    """Extract the list of concept display names from the ``concepts`` field.
+
+    Args:
+        row: Record exposing ``.get`` whose ``concepts`` value is either a
+            list of dicts or the string repr of such a list.
+
+    Returns:
+        The non-empty ``display_name`` of every dict entry, in order; an
+        empty list when the field is missing, unparsable, or not a list.
+    """
+    concepts = row.get('concepts')
+    if isinstance(concepts, str):
+        try:
+            concepts = ast.literal_eval(concepts)
+        except (ValueError, SyntaxError, TypeError,
+                MemoryError, RecursionError):
+            # Only parse failures are swallowed; a bare except would also
+            # hide KeyboardInterrupt and SystemExit.
+            return []
+    if not isinstance(concepts, list):
+        return []
+    return [
+        cpt['display_name']
+        for cpt in concepts
+        if isinstance(cpt, dict) and cpt.get('display_name')
+    ]
+
+def extract_affiliations(row):
+    """Extract the unique affiliation names from the ``authorships`` field.
+
+    Args:
+        row: Record exposing ``.get`` whose ``authorships`` value is either a
+            list of dicts or the string repr of such a list.
+
+    Returns:
+        Each distinct institution ``display_name`` once, in first-seen
+        order; an empty list when the field is missing, unparsable, or not
+        a list.
+    """
+    authorships = row.get('authorships')
+    if isinstance(authorships, str):
+        try:
+            authorships = ast.literal_eval(authorships)
+        except (ValueError, SyntaxError, TypeError,
+                MemoryError, RecursionError):
+            # Only parse failures are swallowed; a bare except would also
+            # hide KeyboardInterrupt and SystemExit.
+            return []
+    if not isinstance(authorships, list):
+        return []
+    # A dict keeps insertion order, so the output is deterministic, unlike
+    # iterating a set.
+    affiliations = {}
+    for auth in authorships:
+        if not isinstance(auth, dict):
+            continue
+        institutions = auth.get('institutions')
+        # The key may be present with a null value; skip anything that is
+        # not a sequence of entries.
+        if not isinstance(institutions, (list, tuple)):
+            continue
+        for inst in institutions:
+            if isinstance(inst, dict) and inst.get('display_name'):
+                affiliations[inst['display_name']] = None
+    return list(affiliations)
+
+def extract_abstract(row):
+    """Reconstruct the abstract text from ``abstract_inverted_index``.
+
+    The inverted index maps each word to the list of positions at which it
+    occurs; the text is rebuilt by ordering the words by position.
+
+    Args:
+        row: Record exposing ``.get`` whose ``abstract_inverted_index`` value
+            is either a dict or the string repr of a dict.
+
+    Returns:
+        The words joined by single spaces in position order. Positions that
+        no word claims are skipped instead of leaving doubled spaces. An
+        empty string is returned when the field is missing, unparsable, or
+        not a dict.
+    """
+    inverted = row.get('abstract_inverted_index')
+    if isinstance(inverted, str):
+        try:
+            inverted = ast.literal_eval(inverted)
+        except (ValueError, SyntaxError, TypeError,
+                MemoryError, RecursionError):
+            # Only parse failures are swallowed; a bare except would also
+            # hide KeyboardInterrupt and SystemExit.
+            return ''
+    if not isinstance(inverted, dict):
+        return ''
+    # position -> word; when two words claim one position the later dict
+    # entry wins, as it did with the former list-assignment approach.
+    word_at = {}
+    for word, positions in inverted.items():
+        if not isinstance(positions, (list, tuple)):
+            continue  # e.g. a null position list
+        for pos in positions:
+            # Negative positions would have indexed from the end of the
+            # word list; ignore them along with non-integers.
+            if isinstance(pos, int) and pos >= 0:
+                word_at[pos] = word
+    return ' '.join(word_at[pos] for pos in sorted(word_at)).strip()
+
+def dispatch_and_map(df, source_name):
+    """Map a raw source DataFrame onto the standardized WoS column tags.
+
+    Fields that need restructuring (authors, references, source, keywords,
+    concepts, affiliations, abstract) are produced by the matching
+    ``extract_*`` function; every other mapped field is a plain column
+    rename. Only the required WoS columns that exist afterwards are kept.
+
+    Args:
+        df: Raw records, one row per document. The frame is not modified.
+        source_name: Key into ``FIELD_MAPPINGS`` selecting the mapping.
+
+    Returns:
+        A new DataFrame restricted to the required WoS columns that could be
+        produced from ``df``.
+
+    Raises:
+        ValueError: If ``source_name`` has no entry in ``FIELD_MAPPINGS``.
+    """
+    if source_name not in FIELD_MAPPINGS:
+        raise ValueError(f"Source '{source_name}' not supported. Check mapping.py.")
+
+    mapping = FIELD_MAPPINGS[source_name]
+
+    # Work on a copy so the derived columns are not written into the
+    # caller's frame.
+    df = df.copy()
+
+    # (target tag, mapping value that activates it, progress message, extractor)
+    # Order only determines the order of the progress messages.
+    extractors = (
+        ('AU', 'authorships', "Extracting authors...", extract_authors),
+        ('CR', 'referenced_works', "Extracting cited references...", extract_cited_references),
+        ('SO', 'primary_location.source.display_name', "Extracting source...", extract_source),
+        ('DE', 'keywords', "Extracting keywords...", extract_keywords),
+        ('ID', 'concepts', "Extracting concepts...", extract_concepts),
+        ('C1', 'authorships', "Extracting affiliations...", extract_affiliations),
+        ('AB', 'abstract_inverted_index', "Extracting abstract...", extract_abstract),
+    )
+    for target, expected_source, message, extractor in extractors:
+        if mapping.get(target) == expected_source:
+            print(message)
+            df[target] = df.apply(extractor, axis=1)
+
+    # Rename the remaining columns. Extractor-backed tags are always skipped,
+    # even when their mapping value did not match above.
+    extracted_tags = {entry[0] for entry in extractors}
+    rename_dict = {
+        source: target
+        for target, source in mapping.items()
+        if target not in extracted_tags and source in df.columns
+    }
+    df = df.rename(columns=rename_dict)
+
+    # Keep only the required WoS columns (as per exam Table 4.2)
+    required_columns = [
+        'UT', 'DI', 'PMID', 'TI', 'SO', 'JI', 'PY', 'DT', 'LA', 'TC',
+        'AU', 'AF', 'C1', 'RP', 'CR', 'DE', 'ID', 'AB', 'VL', 'IS', 'BP', 'EP', 'SR'
+    ]
+    # Keep only columns that actually exist in the DataFrame
+    existing_cols = [col for col in required_columns if col in df.columns]
+
+    # Return an independent frame so later column assignments do not raise
+    # SettingWithCopyWarning.
+    return df[existing_cols].copy()
diff --git a/main.py b/main.py
@@ -0,0 +1,32 @@
+from dispatcher import dispatch_and_map
+from validator import enforce_types
+from api_retriever import fetch_from_openalex
+
+
+def run_advanced_etl(query):
+    """Run the extract -> map -> type-enforce -> export pipeline for a query.
+
+    Fetches records for ``query`` from OpenAlex, standardizes them with the
+    'openalex' mapping, enforces column types, writes the result to
+    ``advanced_standardized_data.csv`` (';'-separated, no index) and prints
+    a short preview. Prints a notice and returns early when nothing was
+    fetched. Always returns None.
+    """
+    print(f"Starting Advanced ETL for: '{query}'")
+
+    # Extract
+    fetched = fetch_from_openalex(query)
+    if fetched.empty:
+        print("No data found.")
+        return
+
+    # Transform: map onto the standardized columns, then coerce types
+    standardized = enforce_types(dispatch_and_map(fetched, 'openalex'))
+
+    # Load
+    destination = "advanced_standardized_data.csv"
+    standardized.to_csv(destination, sep=';', index=False)
+
+    # Report what was written
+    print(f"Pipeline complete! File saved: {destination}")
+    print("First 5 rows:")
+    preview = standardized.head()
+    print(preview)
+    print("\nAll column names in the final DataFrame:")
+    column_names = standardized.columns.tolist()
+    print(column_names)
+
+# Run the pipeline with a sample query, but only when this file is executed
+# as a script, so that merely importing the module does not start a fetch.
+if __name__ == "__main__":
+    run_advanced_etl("machine learning")
diff --git a/mapping.py b/mapping.py
@@ -0,0 +1,18 @@
+# Per-source field mappings used by dispatcher.dispatch_and_map.
+# Outer key: source name. Inner dict: standardized column tag -> field of the
+# raw record that feeds it.
+FIELD_MAPPINGS = {
+    "openalex": {
+        "UT": "id",
+        "TI": "title",
+        "PY": "publication_year",
+        "TC": "cited_by_count",
+        "DI": "doi",
+        "AU": "authorships",  # the dispatcher turns this into a list of author names
+        "DT": "type",
+        "LA": "language",
+        "SO": "primary_location.source.display_name",  # dotted path, resolved by extract_source rather than a column rename
+        "CR": "referenced_works",
+        "DE": "keywords",
+        "ID": "concepts",
+        "C1": "authorships",  # same raw field as AU; the dispatcher pulls institution names from it
+        "AB": "abstract_inverted_index"
+    }
+}
diff --git a/test_all_functions.py b/test_all_functions.py
@@ -0,0 +1,46 @@
+import pandas as pd
+import sys
+import os
+import traceback
+
+# Add the functions folder to sys.path. The location is derived from this
+# file instead of a hard-coded, user-specific absolute path so the script
+# runs on any machine.
+# NOTE(review): assumes the `functions` folder sits next to this script; confirm.
+functions_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "functions")
+sys.path.append(functions_path)
+
+# Import functions
+from functions.get_annualproduction import get_annual_production
+from functions.get_averagecitations import get_average_citations
+from functions.get_relevantsources import get_relevant_sources
+from functions.get_relevantauthors import get_relevant_authors
+
+# Load the standardized data; sep=';' matches the separator main.py uses when
+# it writes advanced_standardized_data.csv.
+df = pd.read_csv("advanced_standardized_data.csv", sep=';')
+
+# Wrapper class: holds a DataFrame and hands it back through .get(), which is
+# how the object is passed to the functions under test below.
+class DataWrapper:
+    """Minimal holder exposing a DataFrame through ``get()``."""
+
+    def __init__(self, df):
+        """Store ``df`` on the instance as ``self.data``."""
+        self.data = df
+    def get(self):
+        """Return the stored object unchanged."""
+        return self.data
+
+wrapped = DataWrapper(df)
+
+# Smoke-test table: (label for the report, function to call, extra positional
+# arguments passed after the wrapped data)
+tests = [
+    ("Annual Production", get_annual_production, []),
+    ("Average Citations", get_average_citations, []),
+    ("Relevant Sources", get_relevant_sources, [10]),
+    ("Relevant Authors", get_relevant_authors, [10, "n_docs"]),
+]
+
+print("Column names:", df.columns.tolist())
+
+# Missing-value counts for the two numeric columns, or a note when absent
+if 'TC' in df.columns:
+    print("TC NaN count:", df['TC'].isna().sum())
+else:
+    print("TC NaN count:", "TC missing")
+if 'PY' in df.columns:
+    print("PY NaN count:", df['PY'].isna().sum())
+else:
+    print("PY NaN count:", "PY missing")
+
+# Call every function once; a raised exception is reported with its traceback
+# and does not stop the remaining checks.
+for name, func, args in tests:
+    try:
+        result = func(wrapped, *args)
+    except Exception as e:
+        print(f"❌ {name} failed: {e}")
+        print(traceback.format_exc())
+    else:
+        print(f"✅ {name} passed")
diff --git a/test_functions.py b/test_functions.py
@@ -0,0 +1,47 @@
+import pandas as pd
+import sys
+import os
+
+# Append the parent of this script's directory (two dirname() calls up from
+# this file) to sys.path so local imports can be resolved from there.
+# NOTE(review): that is the project root only if this script lives one level
+# below it; confirm against the repository layout.
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# Import the corrected function name
+from functions.get_annualproduction import get_annual_production
+
+
+# Create a helper class to mimic the custom data object the function expects:
+# it only needs to return the DataFrame from .get().
+class DataWrapper:
+    """Minimal holder exposing a DataFrame through ``get()``."""
+
+    def __init__(self, df):
+        """Store ``df`` on the instance as ``self.data``."""
+        self.data = df
+
+    def get(self):
+        """Return the stored object unchanged."""
+        return self.data
+
+
+# 1. Locate the standardized data produced by main.py
+file_path = "advanced_standardized_data.csv"
+
+if os.path.exists(file_path):
+    df = pd.read_csv(file_path, sep=';')
+
+    # 2. Call get_annual_production and show the head of its data output
+    try:
+        print("Running analysis...")
+
+        # The function receives an object with .get(), not a bare DataFrame
+        wrapped_df = DataWrapper(df)
+
+        # The call returns a (figure, data) pair
+        fig, pub_data = get_annual_production(wrapped_df)
+
+        print("Function passed!")
+        print("--- First 5 rows of calculated annual production ---")
+        print(pub_data.head())
+
+    except Exception as e:
+        import traceback
+
+        print(f"Function crashed! Error: {e}")
+        traceback.print_exc()
+else:
+    print(f"Error: {file_path} not found. Run main.py first to generate it.")
diff --git a/validator.py b/validator.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+
+def _split_multi_value(value):
+    """Coerce one cell of a multi-value column into a list."""
+    if isinstance(value, list):
+        return value
+    if isinstance(value, str):
+        # Trim each part and drop empties so '' gives [] and 'a; b' gives
+        # ['a', 'b'] rather than [''] and ['a', ' b'].
+        return [part.strip() for part in value.split(';') if part.strip()]
+    return []
+
+
+def enforce_types(df):
+    """Coerce the standardized columns to their expected types, in place.
+
+    Args:
+        df: DataFrame with any subset of the standardized columns. It is
+            modified in place.
+
+    Returns:
+        The same ``df`` object, where:
+        * ``TC`` and ``PY`` are integers; values that cannot be parsed as
+          numbers become 0.
+        * ``AU``, ``AF``, ``C1``, ``CR``, ``DE`` and ``ID`` hold lists:
+          existing lists are kept, strings are split on ';', anything else
+          becomes an empty list.
+    """
+    # 1-2. Ensure TC (citation count) and PY (publication year) are integers
+    for col in ('TC', 'PY'):
+        if col in df.columns:
+            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
+
+    # 3. Ensure multi-value fields are lists, preserve existing lists
+    list_columns = ['AU', 'AF', 'C1', 'CR', 'DE', 'ID']
+    for col in list_columns:
+        if col in df.columns:
+            df[col] = df[col].apply(_split_multi_value)
+
+    # 4. TODO: compute the SR column here; no SR logic is implemented yet.
+
+    return df