In [1]:
import logging
import os
import pandas as pd
import utils
from plotnine import ggplot, aes, geom_point, expand_limits, labs, scale_x_date, facet_wrap
utils.set_logging()
log = logging.getLogger("GenerateDistibutionPlot")
log.info("α Starting")

ROOT_DIR = "./"

# Configuration
# Release identifiers "f7" .. "f43" (contiguous); each one names a
# sub-directory of BASE_PATH that is expected to hold a release.csv file.
RELEASES = [f"f{version}" for version in range(7, 44)]
BASE_PATH = os.path.join(ROOT_DIR, "..", "generated")

# Load one release.csv per release and tag its rows with the release name.
# Frames are collected in a list and concatenated once after the loop, which
# avoids re-copying the accumulated frame on every iteration.
release_frames = []
for release in RELEASES:
    dist_path = f"{BASE_PATH}/{release}/release.csv"
    if not os.path.exists(dist_path):
        log.warning(f"Missing release csv {dist_path}")
        # Skip this release: reading a missing file would raise
        # FileNotFoundError and abort the whole notebook.
        continue
    df = pd.read_csv(dist_path)
    df["release"] = release
    release_frames.append(df)

if not release_frames:
    # Fail early with an explicit message instead of an opaque KeyError on
    # 'release_date' further down.
    raise FileNotFoundError(f"No release.csv found under {BASE_PATH}")

distribution_data = pd.concat(release_frames, ignore_index=True)

# Parse the release dates so the plotting cells can use a date x-axis;
# format='mixed' lets pandas infer the format of each value individually.
distribution_data['release_date'] = pd.to_datetime(distribution_data['release_date'], format='mixed')
log.info(distribution_data.columns.tolist())
2026-01-25 22:32:19,854 - 4129375920.py <module> - GenerateDistibutionPlot - None - INFO - α Starting 2026-01-25 22:32:19,901 - 4129375920.py <module> - GenerateDistibutionPlot - None - INFO - ['Unnamed: 0', 'packages_count', 'packages_detected_count', 'files_detected_count', 'packages_processed_count', 'files_processed_count', 'totalsourcewords', 'languages_processed_count', 'release_date', 'release']
In [2]:
# Scatter plot of languages_processed_count against release_date.
languages_mapping = aes(x="release_date", y="languages_processed_count")
languages_labels = labs(
    title="Number of unique languages",
    x="Release time",
    y="Number of languages",
)
languages_plot = (
    ggplot(distribution_data, languages_mapping)
    + geom_point()
    + expand_limits(y=0)  # make sure the y axis includes 0
    + languages_labels
    + scale_x_date(date_labels="%Y")  # label x ticks with the year only
)
# Bare last expression so the notebook renders the figure.
languages_plot
Out[2]:
In [3]:
# Compare packages_count with packages_processed_count over release_date.
# Reshape to long form first: one row per (release_date, variable, value),
# so each of the two counts can be drawn in its own facet.
data = distribution_data.melt(
    id_vars=["release_date"],
    value_vars=["packages_count", "packages_processed_count"],
)
packages_labels = labs(
    title="Number of packages (srpms)",
    x="Release time",
    y="Number of packages",
)
packages_plot = (
    ggplot(data, aes(x="release_date", y="value"))
    + geom_point()
    + expand_limits(y=0)  # make sure the y axis includes 0
    + packages_labels
    + scale_x_date(date_labels="%Y")  # label x ticks with the year only
    + facet_wrap("variable")  # one panel per melted column
)
# Bare last expression so the notebook renders the figure.
packages_plot
Out[3]:
In [4]:
# Scatter plot of totalsourcewords against release_date.
source_words_mapping = aes(x="release_date", y="totalsourcewords")
source_words_labels = labs(
    title="Number of English words to translate",
    x="Release time",
    y="Number of words",
)
source_words_plot = (
    ggplot(distribution_data, source_words_mapping)
    + geom_point()
    + expand_limits(y=0)  # make sure the y axis includes 0
    + source_words_labels
    + scale_x_date(date_labels="%Y")  # label x ticks with the year only
)
# Bare last expression so the notebook renders the figure.
source_words_plot
Out[4]:
In [5]:
# Final cell: emit a completion marker through the notebook's logger.
log.info("Done ⵣ")
2026-01-25 22:32:20,712 - 2110017864.py <module> - GenerateDistibutionPlot - None - INFO - Done ⵣ