import json
import hashlib
from pathlib import Path
from typing import Optional, Dict, List, Set, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.collections import PolyCollection
from scipy.spatial import Voronoi

# Pandas display limits used when frames are rendered in this notebook
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 20)
pd.set_option("display.width", None)

# Matplotlib defaults for accessibility: default figure geometry plus a
# grayscale-friendly colour cycle
plt.rcParams.update({
    "figure.figsize": (12, 8),
    "figure.dpi": 100,
    "axes.prop_cycle": plt.cycler(color=["#333333", "#666666", "#999999", "#CCCCCC"]),
})

# Base paths: the project root is taken to be the parent of the current
# working directory at the time this cell runs
NOTEBOOK_DIR = Path(".").resolve()
PROJECT_ROOT = NOTEBOOK_DIR.parent

# CLDR inputs (committed files in data/CLDR-raw)
CLDR_DIR = PROJECT_ROOT.joinpath("data", "CLDR-raw")
TERRITORY_INFO_PATH = CLDR_DIR.joinpath("territoryInfo.json")
LANGUAGE_DATA_PATH = CLDR_DIR.joinpath("languageData.json")
TERRITORIES_PATH = CLDR_DIR.joinpath("territories.json")

# Fedora statistics directory (may or may not exist)
FEDORA_STATS_DIR = PROJECT_ROOT.joinpath("generated")
DEFAULT_FEDORA_RELEASE = "f43"  # Most recent release to scan

# Region definitions based on CLDR territory containment (UN M49 codes)
# These are hardcoded to avoid dependency on territoryContainment.json
#
# Keys are the region labels used by the analysis tables and plots; values are
# sets of two-letter territory codes. get_primary_region() counts a territory
# towards the first region whose set contains it, so a code that appears in
# none of the sets below contributes to no region.
# NOTE(review): the sets have not been cross-checked against the current
# territoryContainment.json — confirm coverage when updating CLDR.
REGION_MAPPING = {
    # Africa (002)
    'Africa': {'DZ', 'EG', 'LY', 'MA', 'SD', 'TN', 'EH', 'BJ', 'BF', 'CV', 'CI', 'GM', 'GH', 'GN', 'GW', 'LR', 'ML', 'MR', 'NE', 'NG', 'SN', 'SL', 'TG', 'AO', 'CM', 'CF', 'TD', 'CG', 'CD', 'GQ', 'GA', 'ST', 'BI', 'KM', 'DJ', 'ER', 'ET', 'KE', 'MG', 'MW', 'MU', 'YT', 'MZ', 'RE', 'RW', 'SC', 'SO', 'SS', 'TZ', 'UG', 'ZM', 'ZW', 'BW', 'SZ', 'LS', 'NA', 'ZA'},
    # Americas (019)
    'Americas': {'AI', 'AG', 'AW', 'BS', 'BB', 'BQ', 'VG', 'KY', 'CU', 'CW', 'DM', 'DO', 'GD', 'GP', 'HT', 'JM', 'MQ', 'MS', 'PR', 'BL', 'KN', 'LC', 'MF', 'PM', 'VC', 'SX', 'TT', 'TC', 'VI', 'BZ', 'CR', 'SV', 'GT', 'HN', 'MX', 'NI', 'PA', 'AR', 'BO', 'BV', 'BR', 'CL', 'CO', 'EC', 'FK', 'GF', 'GY', 'PY', 'PE', 'GS', 'SR', 'UY', 'VE', 'BM', 'CA', 'GL', 'US', 'UM'},
    # Asia (142)
    'Asia': {'KZ', 'KG', 'TJ', 'TM', 'UZ', 'CN', 'HK', 'MO', 'KP', 'JP', 'MN', 'KR', 'TW', 'AF', 'BD', 'BT', 'IN', 'IR', 'MV', 'NP', 'PK', 'LK', 'BN', 'KH', 'ID', 'LA', 'MY', 'MM', 'PH', 'SG', 'TH', 'TL', 'VN', 'AM', 'AZ', 'BH', 'CY', 'GE', 'IQ', 'IL', 'JO', 'KW', 'LB', 'OM', 'PS', 'QA', 'SA', 'SY', 'TR', 'AE', 'YE'},
    # Europe (150)
    'Europe': {'BY', 'BG', 'CZ', 'HU', 'MD', 'PL', 'RO', 'RU', 'SK', 'UA', 'AX', 'DK', 'EE', 'FO', 'FI', 'GG', 'IS', 'IE', 'IM', 'JE', 'LV', 'LT', 'NO', 'SJ', 'SE', 'GB', 'AL', 'AD', 'BA', 'HR', 'GI', 'GR', 'VA', 'IT', 'MT', 'ME', 'MK', 'PT', 'SM', 'RS', 'SI', 'ES', 'AT', 'BE', 'FR', 'DE', 'LI', 'LU', 'MC', 'NL', 'CH'},
    # Oceania (009)
    'Oceania': {'AU', 'CX', 'CC', 'HM', 'NZ', 'NF', 'FJ', 'NC', 'PG', 'SB', 'VU', 'GU', 'KI', 'MH', 'FM', 'NR', 'MP', 'PW', 'AS', 'CK', 'PF', 'NU', 'PN', 'WS', 'TK', 'TO', 'TV', 'WF'}
}

# Echo the resolved locations and whether the CLDR directory is present
print(f"Project root: {PROJECT_ROOT}")
print(f"CLDR directory: {CLDR_DIR}")
print(f"CLDR directory exists: {CLDR_DIR.exists()}")

Project root: /home/jb/PycharmProjects/localization-statistics
CLDR directory: /home/jb/PycharmProjects/localization-statistics/data/CLDR-raw
CLDR directory exists: True

def load_cldr_data(cldr_dir: Path) -> Tuple[Dict, Dict]:
    """
    Read the two CLDR supplemental JSON documents used by this analysis.

    Both files are checked for existence before either one is parsed, so a
    missing file is reported without any partial loading.

    Args:
        cldr_dir: Directory containing territoryInfo.json and languageData.json

    Returns:
        Tuple of (territory_info_data, language_data) as parsed JSON

    Raises:
        FileNotFoundError: If either file is absent from cldr_dir
    """
    required_files = [
        cldr_dir / file_name
        for file_name in ("territoryInfo.json", "languageData.json")
    ]

    # Validate first (territoryInfo, then languageData), parse afterwards
    for candidate in required_files:
        if not candidate.exists():
            raise FileNotFoundError(f"CLDR file not found: {candidate}")

    parsed_documents = []
    for candidate in required_files:
        with open(candidate, 'r', encoding='utf-8') as handle:
            parsed_documents.append(json.load(handle))

    territory_info_data, language_data = parsed_documents
    return territory_info_data, language_data


def extract_cldr_languages(territory_info_data: Dict, language_data: Dict) -> Set[str]:
    """
    Extract the complete set of CLDR language codes from both data sources.

    Combines languages from:
    - languageData.json (script/writing system info)
    - territoryInfo.json (population data)

    Alternate forms (keys containing '-alt-', e.g. 'aa-alt-secondary') are
    removed from the combined result, regardless of which source they came
    from, so the returned set only holds primary codes.

    Missing 'supplemental' / 'languageData' / 'territoryInfo' /
    'languagePopulation' sections are treated as empty.

    Args:
        territory_info_data: Parsed territoryInfo.json
        language_data: Parsed languageData.json

    Returns:
        Set of primary language codes
    """
    lang_data_section = language_data.get('supplemental', {}).get('languageData', {})
    territory_info = territory_info_data.get('supplemental', {}).get('territoryInfo', {})

    # Gather every key from both sources first ...
    candidates = set(lang_data_section)
    for territory_data in territory_info.values():
        candidates.update(territory_data.get('languagePopulation', {}))

    # ... then apply the alternate-form filter once to the union
    return {lang for lang in candidates if '-alt-' not in lang}


def compute_speaker_estimates(territory_info_data: Dict) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
    """
    Aggregate per-language speaker estimates and the territories they come from.

    For each (language, territory) pair the estimate is
    population × (language_percent / 100), truncated to an integer before
    being added to the language's running total.

    A territory whose '_population' cannot be parsed as an integer is skipped
    entirely; a language entry whose '_populationPercent' cannot be parsed as
    a float is skipped for that territory only.

    Args:
        territory_info_data: Parsed territoryInfo.json

    Returns:
        Tuple of (speaker_counts dict, language_territories dict); each
        territory list is sorted alphabetically
    """
    territory_info = territory_info_data.get('supplemental', {}).get('territoryInfo', {})
    totals: Dict[str, int] = {}
    spoken_in: Dict[str, List[str]] = {}

    for code, entry in territory_info.items():
        try:
            population = int(entry.get('_population', '0'))
        except (ValueError, TypeError):
            continue

        for lang, details in entry.get('languagePopulation', {}).items():
            try:
                share = float(details.get('_populationPercent', '0'))
            except (ValueError, TypeError):
                continue

            # Truncate per territory, then accumulate
            totals[lang] = totals.get(lang, 0) + int(population * share / 100)
            spoken_in.setdefault(lang, []).append(code)

    for codes in spoken_in.values():
        codes.sort()

    return totals, spoken_in


def detect_fedora_languages(stats_dir: Path, release: str = "f43") -> Tuple[Set[str], bool]:
    """
    Collect Fedora language codes from the CSV file names of one release.

    The codes are the stems of the '*.csv' files found directly in
    <stats_dir>/<release>/languages, minus the special files
    'distribution' and 'error'.

    Args:
        stats_dir: Base stats directory
        release: Fedora release to scan (default: 'f43')

    Returns:
        Tuple of (set of language codes, bool indicating if folder exists)
    """
    languages_dir = stats_dir / release / "languages"
    if not languages_dir.exists():
        return set(), False

    # Stems that are bookkeeping files rather than languages
    reserved_stems = {'distribution', 'error'}
    found = {entry.stem for entry in languages_dir.glob('*.csv')} - reserved_stems
    return found, True


def get_primary_region(territories: List[str], region_mapping: Dict[str, Set[str]]) -> str:
    """
    Determine the primary region for a language based on its territories.

    Returns the region that contains the most of the language's territories.
    Each territory is counted once, towards the first region (in mapping
    order) whose set contains it. When several regions share the highest
    count, the one that comes first in region_mapping wins.

    Args:
        territories: List of territory codes where the language is spoken
        region_mapping: Dict mapping region names to sets of territory codes

    Returns:
        Region name, or 'Unknown' if there are no territories, the mapping is
        empty, or no territory belongs to any region
    """
    # An empty mapping would make max() below raise; it simply means
    # "no region can match".
    if not territories or not region_mapping:
        return 'Unknown'

    region_counts = {region: 0 for region in region_mapping}

    for territory in territories:
        for region, region_territories in region_mapping.items():
            if territory in region_territories:
                region_counts[region] += 1
                break

    # max() returns the first maximal key, which gives the tie-break above
    max_region = max(region_counts, key=region_counts.get)
    return max_region if region_counts[max_region] > 0 else 'Unknown'


def get_speaker_bucket(speakers: Optional[int]) -> str:
    """
    Map an estimated speaker count onto its size bucket.

    Args:
        speakers: Estimated speaker count (may be None or a pandas NA/NaN)

    Returns:
        Bucket label: '<1M', '1–10M', '10–100M', '>100M', or 'Unknown'
    """
    if speakers is None or pd.isna(speakers):
        return 'Unknown'

    # Exclusive upper bounds, checked from the smallest bucket upwards
    for upper_bound, label in (
        (1_000_000, '<1M'),
        (10_000_000, '1–10M'),
        (100_000_000, '10–100M'),
    ):
        if speakers < upper_bound:
            return label

    return '>100M'


def build_alignment_dataframe(
    cldr_langs: Set[str],
    fedora_langs: Set[str],
    speaker_estimates: Dict[str, int],
    territory_mapping: Dict[str, List[str]],
    region_mapping: Dict[str, Set[str]]
) -> pd.DataFrame:
    """
    Build the CLDR ↔ Fedora language alignment DataFrame with enriched columns.

    One row is produced per language in the union of both sets, sorted by
    language code.

    Columns:
    - language_code: Language identifier
    - in_cldr: Boolean - in CLDR?
    - in_fedora: Boolean - in Fedora?
    - estimated_speakers: Total estimated speakers (nullable Int64)
    - territories: Comma-separated territory list (None when there are none)
    - log_speakers: log10(speakers), NaN when speakers is missing or <= 0
    - speaker_bucket: <1M, 1–10M, 10–100M, >100M, Unknown (ordered categorical)
    - region: Primary region (Africa, Americas, Asia, Europe, Oceania, Unknown)

    Args:
        cldr_langs: Set of CLDR language codes
        fedora_langs: Set of Fedora language codes
        speaker_estimates: Dict of language -> speaker count
        territory_mapping: Dict of language -> territory list
        region_mapping: Dict of region -> territory set

    Returns:
        Enriched alignment DataFrame; when both language sets are empty the
        frame has zero rows but still carries every column listed above
    """
    columns = [
        'language_code',
        'in_cldr',
        'in_fedora',
        'estimated_speakers',
        'territories',
        'log_speakers',
        'speaker_bucket',
        'region',
    ]

    all_languages = cldr_langs | fedora_langs

    records = []
    for lang_code in sorted(all_languages):
        territories = territory_mapping.get(lang_code, [])
        speakers = speaker_estimates.get(lang_code)

        # Calculate log_speakers (NaN-safe)
        if speakers is not None and speakers > 0:
            log_speakers = np.log10(speakers)
        else:
            log_speakers = np.nan

        records.append({
            'language_code': lang_code,
            'in_cldr': lang_code in cldr_langs,
            'in_fedora': lang_code in fedora_langs,
            'estimated_speakers': speakers,
            'territories': ', '.join(territories) if territories else None,
            'log_speakers': log_speakers,
            'speaker_bucket': get_speaker_bucket(speakers),
            'region': get_primary_region(territories, region_mapping)
        })

    # Passing the column list explicitly keeps the schema intact even when
    # there are no records; without it the column lookups below would fail
    # on an empty frame.
    df = pd.DataFrame(records, columns=columns)

    # Set appropriate dtypes
    df['in_cldr'] = df['in_cldr'].astype(bool)
    df['in_fedora'] = df['in_fedora'].astype(bool)
    df['estimated_speakers'] = pd.to_numeric(df['estimated_speakers'], errors='coerce').astype('Int64')

    # Make speaker_bucket categorical with proper order
    bucket_order = ['<1M', '1–10M', '10–100M', '>100M', 'Unknown']
    df['speaker_bucket'] = pd.Categorical(df['speaker_bucket'], categories=bucket_order, ordered=True)

    # Make region categorical
    region_order = ['Africa', 'Americas', 'Asia', 'Europe', 'Oceania', 'Unknown']
    df['region'] = pd.Categorical(df['region'], categories=region_order, ordered=True)

    return df

# --- Pipeline driver -------------------------------------------------------
# Runs the loading/enrichment steps in order. The module-level names assigned
# here (territory_info_data, cldr_languages, speaker_estimates,
# language_territories, fedora_languages, stats_found, alignment_df) are read
# by the later analysis and plotting cells.

# Load CLDR data
territory_info_data, language_data = load_cldr_data(CLDR_DIR)
# Direct subscripting: a file without supplemental.version._cldrVersion raises KeyError here
print(f"Loaded CLDR version: {territory_info_data['supplemental']['version']['_cldrVersion']}")

# Extract CLDR languages
cldr_languages = extract_cldr_languages(territory_info_data, language_data)
print(f"CLDR languages: {len(cldr_languages)}")

# Compute speaker estimates and territory mapping
speaker_estimates, language_territories = compute_speaker_estimates(territory_info_data)
print(f"Languages with speaker data: {len(speaker_estimates)}")

# Detect Fedora languages
fedora_languages, stats_found = detect_fedora_languages(FEDORA_STATS_DIR, DEFAULT_FEDORA_RELEASE)

# stats_found is False when the release's languages folder is absent; the
# Fedora set is then empty and the rest of the pipeline still runs
if stats_found:
    print(f"Fedora languages (from {DEFAULT_FEDORA_RELEASE}): {len(fedora_languages)}")
else:
    print(f"ℹ️  Stats folder not found. Running in DEMO MODE with empty Fedora set.")

# Build enriched alignment DataFrame
alignment_df = build_alignment_dataframe(
    cldr_languages,
    fedora_languages,
    speaker_estimates,
    language_territories,
    REGION_MAPPING
)

print(f"\nAlignment DataFrame: {len(alignment_df)} languages, {len(alignment_df.columns)} columns")
print(f"Columns: {list(alignment_df.columns)}")

Loaded CLDR version: 48
CLDR languages: 883
Languages with speaker data: 778
Fedora languages (from f43): 344

Alignment DataFrame: 966 languages, 8 columns
Columns: ['language_code', 'in_cldr', 'in_fedora', 'estimated_speakers', 'territories', 'log_speakers', 'speaker_bucket', 'region']

# Preview the enriched DataFrame
# The trailing bare expression is only rendered by an interactive front end
# (e.g. a notebook cell); run as a plain script it produces no output.
print("Sample data (first 15 rows):")
alignment_df.head(15)

Sample data (first 15 rows):

def summarize_coverage(df: pd.DataFrame, stats_found: bool) -> None:
    """
    Print the CLDR ↔ Fedora coverage report for an alignment DataFrame.

    The count table is always printed. The percentage lines (by language and,
    when the CLDR speaker total is positive, by estimated speakers) are only
    printed when stats_found is true and at least one CLDR language exists.

    Args:
        df: Alignment DataFrame with 'in_cldr', 'in_fedora' and
            'estimated_speakers' columns
        stats_found: Whether Fedora statistics were available
    """
    # Membership masks, computed once and reused for every count
    cldr_mask = df['in_cldr']
    fedora_mask = df['in_fedora']
    overlap_mask = cldr_mask & fedora_mask

    total = len(df)
    cldr_total = int(cldr_mask.sum())
    fedora_total = int(fedora_mask.sum())
    in_both = int(overlap_mask.sum())
    cldr_only = int((cldr_mask & ~fedora_mask).sum())
    fedora_only = int((~cldr_mask & fedora_mask).sum())

    rule = "=" * 55
    print(rule)
    print("CLDR ↔ Fedora Language Coverage Summary")
    print(rule)
    print(f"Total unique languages:           {total:>6}")
    print(f"Languages in CLDR:                {cldr_total:>6}")
    print(f"Languages in Fedora:              {fedora_total:>6}")
    print(f"  → In both (overlap):            {in_both:>6}")
    print(f"  → CLDR only (Fedora gaps):      {cldr_only:>6}")
    print(f"  → Fedora only (not in CLDR):    {fedora_only:>6}")
    print(rule)

    if not (cldr_total > 0 and stats_found):
        return

    coverage_pct = (in_both / cldr_total) * 100
    print(f"\nFedora covers {coverage_pct:.1f}% of CLDR languages")

    # Speaker-weighted coverage
    total_speakers = df.loc[cldr_mask, 'estimated_speakers'].sum()
    covered_speakers = df.loc[overlap_mask, 'estimated_speakers'].sum()
    if total_speakers and total_speakers > 0:
        speaker_coverage = (covered_speakers / total_speakers) * 100
        print(f"Fedora covers {speaker_coverage:.1f}% of estimated speakers")


# Print the coverage report for the alignment table; stats_found controls
# whether the percentage lines are included
summarize_coverage(alignment_df, stats_found)

=======================================================
CLDR ↔ Fedora Language Coverage Summary
=======================================================
Total unique languages:              966
Languages in CLDR:                   883
Languages in Fedora:                 344
  → In both (overlap):               261
  → CLDR only (Fedora gaps):         622
  → Fedora only (not in CLDR):        83
=======================================================

Fedora covers 29.6% of CLDR languages
Fedora covers 72.0% of estimated speakers

def analyze_by_speaker_bucket(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise CLDR languages and their Fedora coverage per speaker bucket.

    Only rows with in_cldr set are considered, and only buckets that actually
    occur are returned.

    Args:
        df: Alignment DataFrame

    Returns:
        DataFrame with columns speaker_bucket, cldr_count, fedora_count,
        total_speakers and coverage_ratio (percentage, one decimal)
    """
    cldr_rows = df.loc[df['in_cldr']].copy()

    summary = (
        cldr_rows
        .groupby('speaker_bucket', observed=True)
        .agg(
            cldr_count=('language_code', 'count'),
            fedora_count=('in_fedora', 'sum'),
            total_speakers=('estimated_speakers', 'sum'),
        )
        .reset_index()
    )

    # Share of the bucket's CLDR languages that Fedora also has, in percent
    ratio = summary['fedora_count'] / summary['cldr_count'] * 100
    summary['coverage_ratio'] = ratio.round(1)

    summary['fedora_count'] = summary['fedora_count'].astype(int)

    return summary


# Per-bucket coverage table; kept in a module-level name because the bar
# chart cell reuses it
speaker_bucket_analysis = analyze_by_speaker_bucket(alignment_df)
print("CLDR Language Count and Fedora Coverage by Speaker Bucket:")
print("(Coverage ratio = Fedora languages / CLDR languages × 100)")
print()
# Bare expression: rendered as the cell result by an interactive front end only
speaker_bucket_analysis

CLDR Language Count and Fedora Coverage by Speaker Bucket:
(Coverage ratio = Fedora languages / CLDR languages × 100)

def analyze_by_region(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise CLDR languages and their Fedora coverage per region.

    Only rows with in_cldr set are considered, and only regions that actually
    occur are returned.

    Args:
        df: Alignment DataFrame

    Returns:
        DataFrame with columns region, cldr_count, fedora_count,
        total_speakers and coverage_ratio (percentage, one decimal)
    """
    cldr_rows = df.loc[df['in_cldr']].copy()

    summary = (
        cldr_rows
        .groupby('region', observed=True)
        .agg(
            cldr_count=('language_code', 'count'),
            fedora_count=('in_fedora', 'sum'),
            total_speakers=('estimated_speakers', 'sum'),
        )
        .reset_index()
    )

    # Share of the region's CLDR languages that Fedora also has, in percent
    ratio = summary['fedora_count'] / summary['cldr_count'] * 100
    summary['coverage_ratio'] = ratio.round(1)

    summary['fedora_count'] = summary['fedora_count'].astype(int)

    return summary


# Per-region coverage table; kept in a module-level name because the bar
# chart cell reuses it
region_analysis = analyze_by_region(alignment_df)
print("CLDR Language Count and Fedora Coverage by Region:")
print("(Coverage ratio = Fedora languages / CLDR languages × 100)")
print()
# Bare expression: rendered as the cell result by an interactive front end only
region_analysis

CLDR Language Count and Fedora Coverage by Region:
(Coverage ratio = Fedora languages / CLDR languages × 100)

def plot_speaker_bucket_comparison(bucket_df: pd.DataFrame) -> None:
    """
    Draw side-by-side bars of CLDR and Fedora language counts per speaker bucket.

    The 'Unknown' bucket is left out. Each Fedora bar is annotated with the
    bucket's coverage ratio. Grayscale fills plus hatching keep the two
    series distinguishable without colour.

    Args:
        bucket_df: Table with speaker_bucket, cldr_count, fedora_count and
            coverage_ratio columns
    """
    bar_width = 0.35

    # Drop the Unknown bucket for a cleaner chart
    known = bucket_df.loc[bucket_df['speaker_bucket'] != 'Unknown'].copy()
    positions = np.arange(len(known))
    cldr_positions = positions - bar_width / 2
    fedora_positions = positions + bar_width / 2

    fig, ax = plt.subplots(figsize=(10, 6))

    # CLDR: solid gray; Fedora: lighter gray with hatching
    ax.bar(cldr_positions, known['cldr_count'], bar_width,
           label='CLDR', color='#888888', edgecolor='black')
    ax.bar(fedora_positions, known['fedora_count'], bar_width,
           label='Fedora', color='#CCCCCC', edgecolor='black', hatch='//')

    # Coverage percentage on top of each Fedora bar
    for bar_x, ratio, fedora_n in zip(fedora_positions, known['coverage_ratio'], known['fedora_count']):
        ax.annotate(f"{ratio:.0f}%",
                    xy=(bar_x, fedora_n),
                    ha='center', va='bottom', fontsize=9)

    ax.set_title('CLDR vs Fedora Languages by Estimated Speaker Count', fontsize=14)
    ax.set_xlabel('Speaker Bucket', fontsize=12)
    ax.set_ylabel('Number of Languages', fontsize=12)
    ax.set_xticks(positions)
    ax.set_xticklabels(known['speaker_bucket'].astype(str))
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()


# Render the per-bucket bar chart from the table computed earlier
plot_speaker_bucket_comparison(speaker_bucket_analysis)

def plot_region_comparison(region_df: pd.DataFrame) -> None:
    """
    Draw side-by-side bars of CLDR and Fedora language counts per region.

    The 'Unknown' region is left out. Each Fedora bar is annotated with the
    region's coverage ratio. Grayscale fills plus hatching keep the two
    series distinguishable without colour.

    Args:
        region_df: Table with region, cldr_count, fedora_count and
            coverage_ratio columns
    """
    bar_width = 0.35

    # Drop the Unknown region
    known = region_df.loc[region_df['region'] != 'Unknown'].copy()
    positions = np.arange(len(known))
    cldr_positions = positions - bar_width / 2
    fedora_positions = positions + bar_width / 2

    fig, ax = plt.subplots(figsize=(12, 6))

    # CLDR: solid gray; Fedora: lighter gray with hatching
    ax.bar(cldr_positions, known['cldr_count'], bar_width,
           label='CLDR', color='#888888', edgecolor='black')
    ax.bar(fedora_positions, known['fedora_count'], bar_width,
           label='Fedora', color='#CCCCCC', edgecolor='black', hatch='//')

    # Coverage percentage on top of each Fedora bar
    for bar_x, ratio, fedora_n in zip(fedora_positions, known['coverage_ratio'], known['fedora_count']):
        ax.annotate(f"{ratio:.0f}%",
                    xy=(bar_x, fedora_n),
                    ha='center', va='bottom', fontsize=9)

    ax.set_title('CLDR vs Fedora Languages by Geographic Region', fontsize=14)
    ax.set_xlabel('Region', fontsize=12)
    ax.set_ylabel('Number of Languages', fontsize=12)
    ax.set_xticks(positions)
    ax.set_xticklabels(known['region'].astype(str))
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()


# Render the per-region bar chart from the table computed earlier
plot_region_comparison(region_analysis)

def plot_speakers_vs_coverage(df: pd.DataFrame) -> None:
    """
    Scatter plot showing estimated speakers vs Fedora coverage status.

    Each CLDR language with a known log_speakers value becomes one marker:
    X is log10(speakers), Y is the language's region band plus a small random
    vertical jitter so overlapping markers stay visible. Languages with
    Fedora translations are drawn filled, the others hollow.

    The jitter comes from a generator that is local to this call and seeded
    with a fixed value, so the figure is reproducible and the process-wide
    NumPy random state is left untouched.

    Args:
        df: Alignment DataFrame with in_cldr, in_fedora, log_speakers and
            region columns
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    # Filter to CLDR languages with speaker data
    plot_df = df[df['in_cldr'] & df['log_speakers'].notna()].copy()

    # Separate covered and not covered
    covered = plot_df[plot_df['in_fedora']].copy()
    not_covered = plot_df[~plot_df['in_fedora']].copy()

    # Band position of each region on the y-axis
    region_y = {'Africa': 1, 'Americas': 2, 'Asia': 3, 'Europe': 4, 'Oceania': 5, 'Unknown': 0}

    # Local, fixed-seed generator: same value stream as seeding the global
    # legacy RNG with 42, but without the global side effect
    rng = np.random.RandomState(42)

    # Add jitter - convert categorical to string first for mapping.
    # Draw order (covered first, then not covered) fixes which values each group gets.
    covered_y = covered['region'].astype(str).map(region_y).values + rng.uniform(-0.3, 0.3, len(covered))
    not_covered_y = not_covered['region'].astype(str).map(region_y).values + rng.uniform(-0.3, 0.3, len(not_covered))

    # Plot not covered (empty circles)
    ax.scatter(not_covered['log_speakers'], not_covered_y,
               s=50, c='white', edgecolors='#666666', linewidth=1.5,
               label='Not in Fedora', alpha=0.7)

    # Plot covered (filled dark circles), drawn second so they sit on top
    ax.scatter(covered['log_speakers'], covered_y,
               s=50, c='#333333', edgecolors='black', linewidth=1,
               label='In Fedora', alpha=0.8)

    # Customize axes
    ax.set_xlabel('Log₁₀(Estimated Speakers)', fontsize=12)
    ax.set_ylabel('Region (jittered)', fontsize=12)
    ax.set_title('Language Distribution: Speakers vs Region\n(Fedora coverage indicated by fill)', fontsize=14)

    # Ticks and labels are taken from the same dict, so they stay paired
    ax.set_yticks(list(region_y.values()))
    ax.set_yticklabels(list(region_y.keys()))
    ax.set_ylim(-0.5, 5.5)

    ax.legend(loc='upper left')
    ax.grid(alpha=0.3)

    # Add reference lines for speaker thresholds (positions are log10 values)
    for threshold, label in [(6, '1M'), (7, '10M'), (8, '100M')]:
        ax.axvline(x=threshold, color='#AAAAAA', linestyle='--', linewidth=1, alpha=0.5)
        ax.text(threshold, 5.3, label, ha='center', fontsize=9, color='#666666')

    plt.tight_layout()
    plt.show()


# Render the speakers-vs-region scatter for the full alignment table
plot_speakers_vs_coverage(alignment_df)

def generate_voronoi_coordinates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate deterministic 2D coordinates for Voronoi diagram.

    X-axis: log_speakers; missing values are placed one unit to the left of
    the smallest known value (or at 2 when no value is known at all).
    Y-axis: region index + deterministic jitter based on language code hash.

    The input frame is not modified, and an empty frame is returned with the
    two extra (empty) columns.

    Args:
        df: Alignment DataFrame with language_code, log_speakers and region columns

    Returns:
        Copy of df with added 'voronoi_x' and 'voronoi_y' columns
    """
    result = df.copy()

    # Region to Y mapping
    region_y = {'Africa': 1, 'Americas': 2, 'Asia': 3, 'Europe': 4, 'Oceania': 5, 'Unknown': 0}

    # X-coordinate: log_speakers with imputation
    # For missing speakers, use minimum log_speakers minus 1
    min_log = result['log_speakers'].min()
    if pd.isna(min_log):
        min_log = 3  # Default fallback (~1000 speakers)

    result['voronoi_x'] = result['log_speakers'].fillna(min_log - 1)

    # Y-coordinate: region + deterministic jitter
    def get_jitter(lang_code: str) -> float:
        """Generate deterministic jitter from language code hash."""
        # MD5 is used purely as a stable hash here, not for security
        hash_val = int(hashlib.md5(lang_code.encode()).hexdigest()[:8], 16)
        return (hash_val % 1000) / 1000 * 0.8 - 0.4  # Range: [-0.4, 0.4)

    # Built element-wise rather than with DataFrame.apply(axis=1): apply
    # returns a DataFrame instead of a Series for an empty frame, which
    # cannot be assigned to a single column.
    y_values = [
        region_y.get(str(region), 0) + get_jitter(lang_code)
        for region, lang_code in zip(result['region'], result['language_code'])
    ]
    result['voronoi_y'] = pd.Series(y_values, index=result.index, dtype=float)

    return result


# Generate coordinates
# Only CLDR languages are laid out; voronoi_df is reused by the Voronoi plot cell
voronoi_df = generate_voronoi_coordinates(alignment_df[alignment_df['in_cldr']].copy())
print(f"Prepared {len(voronoi_df)} CLDR languages for Voronoi diagram")
# Coordinate extents, printed as a quick sanity check of the layout
print(f"X range: {voronoi_df['voronoi_x'].min():.2f} to {voronoi_df['voronoi_x'].max():.2f}")
print(f"Y range: {voronoi_df['voronoi_y'].min():.2f} to {voronoi_df['voronoi_y'].max():.2f}")

Prepared 883 CLDR languages for Voronoi diagram
X range: -1.00 to 9.24
Y range: -0.39 to 5.40

def plot_abstract_voronoi(df: pd.DataFrame) -> None:
    """
    Create an improved abstract Voronoi diagram showing language coverage.

    Features:
    - Each cell represents a CLDR language
    - X-axis: log10(speakers) - larger = more speakers
    - Y-axis: region bands (Africa, Americas, Asia, Europe, Oceania)
    - Green cells: language has Fedora translations
    - Light gray cells: language missing from Fedora
    - Labels for top languages by speaker count

    Cells are clipped geometrically to the plotted rectangle, so the part of
    each cell that is drawn is the true Voronoi cell. With fewer than four
    points a placeholder message is shown instead of a diagram.

    After the figure is shown, a one-line coverage summary is printed.

    Args:
        df: DataFrame with voronoi_x, voronoi_y, in_fedora, language_code and
            estimated_speakers columns
    """
    fig, ax = plt.subplots(figsize=(16, 12))

    # Extract points
    points = df[['voronoi_x', 'voronoi_y']].values

    # Need at least 4 points for Voronoi
    if len(points) < 4:
        ax.text(0.5, 0.5, 'Insufficient data for Voronoi diagram',
                ha='center', va='center', transform=ax.transAxes, fontsize=14)
        plt.show()
        return

    # Define cleaner bounds with padding
    x_min, x_max = points[:, 0].min() - 0.5, points[:, 0].max() + 0.5
    y_min, y_max = -0.6, 5.6  # Fixed bounds for region strips

    # Add far-away boundary points so that every real point gets a bounded
    # (finite) Voronoi cell
    boundary_points = np.array([
        [x_min - 20, y_min - 20],
        [x_min - 20, y_max + 20],
        [x_max + 20, y_min - 20],
        [x_max + 20, y_max + 20],
        [(x_min + x_max)/2, y_min - 20],
        [(x_min + x_max)/2, y_max + 20],
        [x_min - 20, (y_min + y_max)/2],
        [x_max + 20, (y_min + y_max)/2]
    ])

    all_points = np.vstack([points, boundary_points])

    # Compute Voronoi tessellation
    vor = Voronoi(all_points)

    # Get data for coloring and labeling
    in_fedora = df['in_fedora'].values
    lang_codes = df['language_code'].values
    speakers = df['estimated_speakers'].fillna(0).values

    def clip_polygon(polygon, x_min, x_max, y_min, y_max):
        """
        Clip a polygon to rectangular bounds (Sutherland–Hodgman).

        The polygon is cut against each of the four rectangle edges in turn;
        where a polygon edge crosses a rectangle edge the intersection point
        is inserted. Returns an (n, 2) array, with n == 0 when the polygon
        lies entirely outside the rectangle.
        """
        vertices = [np.asarray(vertex, dtype=float) for vertex in polygon]

        # (axis, bound, sign): sign=+1 keeps coord >= bound, sign=-1 keeps coord <= bound
        half_planes = ((0, x_min, 1), (0, x_max, -1), (1, y_min, 1), (1, y_max, -1))

        for axis, bound, sign in half_planes:
            if not vertices:
                break

            kept = []
            previous = vertices[-1]
            previous_inside = sign * (previous[axis] - bound) >= 0

            for current in vertices:
                current_inside = sign * (current[axis] - bound) >= 0

                if current_inside != previous_inside:
                    # The edge crosses the boundary; the two coordinates
                    # differ here, so the division is safe
                    t = (bound - previous[axis]) / (current[axis] - previous[axis])
                    kept.append(previous + t * (current - previous))

                if current_inside:
                    kept.append(current)

                previous, previous_inside = current, current_inside

            vertices = kept

        return np.array(vertices, dtype=float).reshape(-1, 2)

    # Draw region background strips for visual separation
    region_colors = ['#F8F8F8', '#FFFFFF']
    for i in range(6):
        rect = plt.Rectangle((x_min, i - 0.5), x_max - x_min, 1,
                              facecolor=region_colors[i % 2], edgecolor='none', alpha=0.5, zorder=0)
        ax.add_patch(rect)

    # Draw Voronoi regions (only for the real points, which come first in all_points)
    for idx in range(len(points)):
        region_idx = vor.point_region[idx]
        if region_idx == -1:
            continue

        region = vor.regions[region_idx]
        # Skip empty regions and regions with a vertex at infinity (-1)
        if not region or -1 in region:
            continue

        # Get polygon vertices
        polygon = np.array([vor.vertices[i] for i in region])

        # Clip polygon to visible bounds
        polygon = clip_polygon(polygon, x_min, x_max, y_min, y_max)

        # Skip if polygon is degenerate (e.g. entirely outside the bounds)
        if len(polygon) < 3:
            continue

        # Choose style based on Fedora coverage - use distinct colors
        if in_fedora[idx]:
            # Green tint for Fedora languages
            poly = plt.Polygon(polygon, facecolor='#90EE90', edgecolor='#2E8B57',
                               linewidth=0.8, alpha=0.7, zorder=1)
        else:
            # Light gray for missing languages
            poly = plt.Polygon(polygon, facecolor='#E8E8E8', edgecolor='#AAAAAA',
                               linewidth=0.5, alpha=0.6, zorder=1)

        ax.add_patch(poly)

    # Plot points with better visibility
    fedora_mask = df['in_fedora'].values

    # Non-Fedora points (hollow circles)
    ax.scatter(points[~fedora_mask, 0], points[~fedora_mask, 1],
               s=30, c='white', edgecolors='#888888', linewidth=1,
               zorder=4, label='Not in Fedora', marker='o')

    # Fedora points (filled circles)
    ax.scatter(points[fedora_mask, 0], points[fedora_mask, 1],
               s=40, c='#228B22', edgecolors='#145214', linewidth=1,
               zorder=5, label='In Fedora', marker='o')

    # Add labels for top languages (by speaker count) to help interpretation
    # Get top 15 languages overall
    top_indices = np.argsort(speakers)[-15:]

    for idx in top_indices:
        if speakers[idx] > 0:
            x, y = points[idx]
            label = lang_codes[idx]
            # Color based on Fedora coverage
            color = '#145214' if in_fedora[idx] else '#666666'
            fontweight = 'bold' if in_fedora[idx] else 'normal'

            # Add label with offset
            ax.annotate(label, (x, y),
                       xytext=(5, 5), textcoords='offset points',
                       fontsize=8, color=color, fontweight=fontweight,
                       zorder=6, alpha=0.9)

    # Set axis limits
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)

    # Draw region separator lines
    for y_val in [0.5, 1.5, 2.5, 3.5, 4.5]:
        ax.axhline(y=y_val, color='#CCCCCC', linestyle='-', linewidth=0.5, zorder=2)

    # Labels and title
    ax.set_xlabel('Estimated Speakers (log₁₀ scale) →\nSmaller ← → Larger', fontsize=12, labelpad=10)
    ax.set_ylabel('Geographic Region', fontsize=12)

    title_text = ('Abstract Voronoi Diagram: Language Coverage Landscape\n'
                  'Each cell = one language | Green = Fedora translated | Gray = Missing')
    ax.set_title(title_text, fontsize=14, fontweight='bold', pad=15)

    # Add region labels on y-axis
    region_labels = ['Unknown', 'Africa', 'Americas', 'Asia', 'Europe', 'Oceania']
    ax.set_yticks([0, 1, 2, 3, 4, 5])
    ax.set_yticklabels(region_labels, fontsize=11)

    # Add speaker threshold reference lines with better visibility
    # (threshold positions are log10 values)
    speaker_thresholds = [
        (4, '10K', '#DDDDDD'),
        (5, '100K', '#CCCCCC'),
        (6, '1M', '#AAAAAA'),
        (7, '10M', '#888888'),
        (8, '100M', '#666666'),
        (9, '1B', '#444444')
    ]

    for threshold, label, color in speaker_thresholds:
        if x_min < threshold < x_max:
            ax.axvline(x=threshold, color=color, linestyle='--', linewidth=1.5, zorder=3, alpha=0.7)
            ax.text(threshold, y_max + 0.15, label, ha='center', fontsize=10,
                   color=color, fontweight='bold')

    # Create custom legend
    legend_elements = [
        mpatches.Patch(facecolor='#90EE90', edgecolor='#2E8B57', linewidth=1.5,
                       label='✓ In Fedora (translated)'),
        mpatches.Patch(facecolor='#E8E8E8', edgecolor='#AAAAAA', linewidth=1,
                       label='✗ Not in Fedora (gap)'),
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='#228B22',
                   markeredgecolor='#145214', markersize=10, label='Language point (Fedora)'),
        plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='white',
                   markeredgecolor='#888888', markersize=10, label='Language point (Missing)'),
    ]
    ax.legend(handles=legend_elements, loc='upper left', fontsize=10,
              framealpha=0.95, edgecolor='#CCCCCC')

    # Add explanatory note box
    explanation = (
        "HOW TO READ THIS DIAGRAM:\n"
        "• Each CELL represents one language from CLDR\n"
        "• Cell POSITION: X = speaker count, Y = region\n"
        "• Cell SIZE: larger cells = more 'unique' in the landscape\n"
        "• GREEN = Fedora has translations\n"
        "• GRAY = potential translation opportunity\n"
        "• Labels show top 15 languages by speakers"
    )
    ax.text(0.98, 0.02, explanation,
            transform=ax.transAxes, fontsize=9, verticalalignment='bottom',
            horizontalalignment='right', family='monospace',
            bbox=dict(boxstyle='round,pad=0.5', facecolor='#FFFFEE',
                     edgecolor='#CCCC99', alpha=0.95))

    # Add "NOT A MAP" warning
    ax.text(0.5, 0.98, '⚠️ CONCEPTUAL DIAGRAM - NOT A GEOGRAPHIC MAP ⚠️',
            transform=ax.transAxes, fontsize=11, ha='center', va='top',
            color='#CC6600', fontweight='bold',
            bbox=dict(boxstyle='round', facecolor='#FFF8E7', edgecolor='#FFCC80'))

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    total = len(df)
    fedora_count = in_fedora.sum()
    print(f"\n📊 Voronoi Summary: {fedora_count}/{total} languages ({fedora_count/total*100:.1f}%) have Fedora translations")


# Draw the abstract Voronoi diagram for voronoi_df; the call shows the figure
# and then prints a one-line Fedora coverage summary.
plot_abstract_voronoi(voronoi_df)

📊 Voronoi Summary: 261/883 languages (29.6%) have Fedora translations

# Expansion opportunities: languages present in CLDR but absent from Fedora,
# restricted to rows with a speaker estimate and ranked largest-first.
cldr_without_fedora = alignment_df['in_cldr'] & ~alignment_df['in_fedora']
has_speaker_estimate = alignment_df['estimated_speakers'].notna()
missing_from_fedora = alignment_df[
    cldr_without_fedora & has_speaker_estimate
].sort_values('estimated_speakers', ascending=False)

if len(missing_from_fedora) == 0:
    # Nothing to list; which message is shown depends on stats_found.
    if stats_found:
        print("✓ Fedora covers all CLDR languages with speaker estimates!")
    else:
        print("ℹ️  Running in demo mode — no Fedora data to compare.")
else:
    print("Top 20 CLDR languages missing from Fedora (by estimated speakers):")
    print()
    display_cols = ['language_code', 'estimated_speakers', 'speaker_bucket', 'region']
    # Plain-text table of the 20 largest gaps, without the DataFrame index.
    top_gaps = missing_from_fedora.head(20)[display_cols]
    print(top_gaps.to_string(index=False))

Top 20 CLDR languages missing from Fedora (by estimated speakers):

language_code  estimated_speakers speaker_bucket region
           zh          1286444444          >100M   Asia
      pa_Arab           176654800          >100M   Asia
          lah           100980828          >100M   Asia
          wuu            84962400        10–100M   Asia
          yue            81532220        10–100M   Asia
     yue_Hans            73634080        10–100M   Asia
          arz            71198080        10–100M Africa
          pcm            49716870        10–100M Africa
          apc            43682527        10–100M   Asia
          hsn            41065160        10–100M   Asia
          arq            39028675        10–100M Africa
          bho            34639015        10–100M   Asia
          hak            32568919        10–100M   Asia
          ary            32527212        10–100M Africa
          apd            30785053        10–100M Africa
          skr            30283680        10–100M   Asia
          ceb            28386480        10–100M   Asia
          awa            27458162        10–100M   Asia
          nan            26904760        10–100M   Asia
          gan            24072680        10–100M   Asia

# Final summary: closing banner, the shape of alignment_df, and a small
# reproducible sample of the languages flagged as in Fedora.
banner = "=" * 60
print(banner)
print("NOTEBOOK COMPLETE")
print(banner)
print(f"\nDataFrame 'alignment_df' ready with {len(alignment_df)} languages")
print(f"Columns: {list(alignment_df.columns)}")
print("\nSample:")
fedora_rows = alignment_df[alignment_df['in_fedora']]
# Cap the sample size at the number of available rows; the fixed seed keeps
# the displayed rows stable across runs. Must remain the last expression so
# the notebook renders it.
fedora_rows.sample(min(5, len(fedora_rows)), random_state=42)

============================================================
NOTEBOOK COMPLETE
============================================================

DataFrame 'alignment_df' ready with 966 languages
Columns: ['language_code', 'in_cldr', 'in_fedora', 'estimated_speakers', 'territories', 'log_speakers', 'speaker_bucket', 'region']

Sample:

CLDR ↔ Fedora Language Alignment

Purpose

Key Concepts

Analysis Components

Setup and Imports

Configuration

Core Functions

Load Data and Build Alignment

Coverage Summary

Analysis Tables

Speaker Bucket Analysis

Region Analysis

Visualizations

Bar Chart: CLDR vs Fedora by Speaker Bucket

Bar Chart: CLDR vs Fedora by Region

Scatter Plot: Speakers vs Fedora Coverage

Abstract Voronoi Diagram

Important Note on Interpretation

Top Missing Languages (Expansion Opportunities)

Summary

Data Alignment

Analysis

Visualizations (Grayscale + Hatching)

Key Insight

	language_code	in_cldr	in_fedora	estimated_speakers	territories	log_speakers	speaker_bucket	region
0	aa	True	True	2305971	DJ, ER, ET	6.362854	1–10M	Africa
1	ab	True	True	111858	GE, TR	5.048667	<1M	Asia
2	abq	True	False	29572	RU	4.470881	<1M	Europe
3	abq_Latn	True	False	12617	TR	4.100956	<1M	Asia
4	abr	True	False	1729455	GH	6.237909	1–10M	Africa
5	ace	True	True	3941868	ID	6.595702	1–10M	Asia
6	ach	True	True	1823471	UG	6.260899	1–10M	Africa
7	acr	True	False	200807	GT	5.302779	<1M	Americas
8	ada	True	False	1037673	GH	6.016061	1–10M	Africa
9	ady	True	False	451988	RU, TR	5.655127	<1M	Asia
10	ae	True	True	<NA>	NaN	NaN	Unknown	Unknown
11	aeb	True	False	10843920	TN	7.035186	10–100M	Africa
12	af	True	True	9966164	BW, NA, ZA	6.998528	1–10M	Africa
13	agq	True	False	43352	CM	4.637009	<1M	Africa
14	agu	True	False	47463	GT	4.676355	<1M	Americas

	speaker_bucket	cldr_count	fedora_count	total_speakers	coverage_ratio
0	<1M	433	90	103386587	20.8
1	1–10M	239	89	845069026	37.2
2	10–100M	89	60	3023800073	67.4
3	>100M	17	14	6871383544	82.4
4	Unknown	105	8	0	7.6

	region	cldr_count	fedora_count	total_speakers	coverage_ratio
0	Africa	231	59	2786703644	25.5
1	Americas	104	26	2608072054	25.0
2	Asia	271	66	4624323138	24.4
3	Europe	142	88	813187626	62.0
4	Oceania	29	13	9889722	44.8
5	Unknown	106	9	1463046	8.5

	language_code	in_cldr	in_fedora	estimated_speakers	territories	log_speakers	speaker_bucket	region
511	ltg	True	True	160311	LV	5.204963	<1M	Europe
373	isv	False	True	<NA>	NaN	NaN	Unknown	Unknown
611	ne	True	True	21735356	BT, IN, NP	7.337167	10–100M	Asia
567	mni	True	True	1568599	BD, IN	6.195512	1–10M	Asia
877	udm	True	True	535119	RU	5.728450	<1M	Europe