In [1]:
import logging
import os

import pandas as pd
import utils
from plotnine import ggplot, aes, geom_point, expand_limits, labs, scale_x_date, facet_wrap

# --- Logging ---------------------------------------------------------------
# utils.set_logging() is a project helper (not shown here); call it before the
# first message is emitted, then fetch this notebook's logger.
# NOTE(review): "Distibution" in the logger name looks like a typo for
# "Distribution"; it is kept as-is because the name appears in the log output.
utils.set_logging()
log = logging.getLogger("GenerateDistibutionPlot")
log.info("α Starting")

# --- Configuration ---------------------------------------------------------
ROOT_DIR = "./"

# Release identifiers "f7" through "f43", in ascending order.
RELEASES = [f"f{number}" for number in range(7, 44)]

# Directory expected to hold the generated files, one level above ROOT_DIR.
BASE_PATH = os.path.join(ROOT_DIR, "..", "generated")

# Load every available per-release summary CSV and stack them into a single
# frame, tagging each row with the release it came from.
release_frames = []

for release in RELEASES:
    dist_path = f"{BASE_PATH}/{release}/release.csv"
    if not os.path.exists(dist_path):
        # Skip releases whose CSV is absent; without this `continue` the
        # read_csv call below raises FileNotFoundError right after the warning.
        log.warning(f"Missing release csv {dist_path}")
        continue

    df = pd.read_csv(dist_path)
    df["release"] = release
    release_frames.append(df)

if not release_frames:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No release.csv found under {BASE_PATH}")

# Concatenate once instead of growing the frame inside the loop, which
# re-copies all previously loaded rows on every iteration.
distribution_data = pd.concat(release_frames, ignore_index=True)

# format='mixed' lets pandas infer the date format of each value individually.
distribution_data['release_date'] = pd.to_datetime(distribution_data['release_date'], format='mixed')

log.info(distribution_data.columns.tolist())
2026-01-25 22:32:19,854 - 4129375920.py <module> - GenerateDistibutionPlot - None - INFO - α Starting
2026-01-25 22:32:19,901 - 4129375920.py <module> - GenerateDistibutionPlot - None - INFO - ['Unnamed: 0', 'packages_count', 'packages_detected_count', 'files_detected_count', 'packages_processed_count', 'files_processed_count', 'totalsourcewords', 'languages_processed_count', 'release_date', 'release']
In [2]:
# Scatter plot of the number of processed languages against release date.
languages_plot = (
    ggplot(
        distribution_data,
        aes(x="release_date", y="languages_processed_count"),
    )
    + geom_point()
    + expand_limits(y=0)  # make sure 0 is part of the y range
    + labs(
        title="Number of unique languages",
        x="Release time",
        y="Number of languages",
    )
    + scale_x_date(date_labels="%Y")  # tick labels show the year only
)
# Last expression of the cell, so the notebook renders the figure.
languages_plot
Out[2]:
No description has been provided for this image
In [3]:
# Number of packages vs. number of packages with translation.
# Reshape the two counters to long form ("variable" / "value" columns) so each
# counter can be drawn in its own facet.
package_counts_long = distribution_data.melt(
    id_vars=["release_date"],
    value_vars=["packages_count", "packages_processed_count"],
)

packages_plot = (
    ggplot(package_counts_long, aes(x="release_date", y="value"))
    + geom_point()
    + expand_limits(y=0)  # make sure 0 is part of the y range
    + labs(
        title="Number of packages (srpms)",
        x="Release time",
        y="Number of packages",
    )
    + scale_x_date(date_labels="%Y")  # tick labels show the year only
    + facet_wrap("variable")  # one panel per melted counter
)
# Last expression of the cell, so the notebook renders the figure.
packages_plot
Out[3]:
No description has been provided for this image
In [4]:
# Scatter plot of the total number of source words against release date.
source_words_plot = (
    ggplot(
        distribution_data,
        aes(x="release_date", y="totalsourcewords"),
    )
    + geom_point()
    + expand_limits(y=0)  # make sure 0 is part of the y range
    + labs(
        title="Number of English words to translate",
        x="Release time",
        y="Number of words",
    )
    + scale_x_date(date_labels="%Y")  # tick labels show the year only
)
# Last expression of the cell, so the notebook renders the figure.
source_words_plot
Out[4]:
No description has been provided for this image
In [5]:
# Final log line: reaching it means every cell above ran to completion.
log.info("Done ⵣ")
2026-01-25 22:32:20,712 - 2110017864.py <module> - GenerateDistibutionPlot - None - INFO - Done ⵣ