This notebook analyzes OpenStreetMap notes - map feedback reports that help improve data quality. Notes allow anyone to report issues or suggest improvements to the map.
df_long = duckdb.sql("""
WITH notes_created AS (
SELECT
YEAR(created_at) as year,
MONTH(created_at) as month,
COUNT(*) as created_count
FROM '../notes_data/*.parquet'
GROUP BY year, month
),
notes_closed AS (
SELECT
YEAR(closed_at) as year,
MONTH(closed_at) as month,
COUNT(*) as closed_count
FROM '../notes_data/*.parquet'
WHERE closed_at IS NOT NULL
GROUP BY year, month
),
all_metrics AS (
-- This CTE calculates both monthly and accumulated values in one place
SELECT
CONCAT(nc.year, '-', LPAD(CAST(nc.month as VARCHAR), 2, '0')) as months,
nc.created_count,
COALESCE(ncl.closed_count, 0) as closed_count,
SUM(nc.created_count) OVER (ORDER BY nc.year, nc.month) as accumulated_created,
SUM(COALESCE(ncl.closed_count, 0)) OVER (ORDER BY nc.year, nc.month) as accumulated_closed
FROM notes_created nc
LEFT JOIN notes_closed ncl ON nc.year = ncl.year AND nc.month = ncl.month
)
-- Reshape all metrics into a single long-format table with a 'Chart' column for filtering
SELECT months, created_count as "Count", 'Monthly' as "Chart", 'Created' as "Type" FROM all_metrics
UNION ALL
SELECT months, closed_count as "Count", 'Monthly' as "Chart", 'Closed' as "Type" FROM all_metrics
UNION ALL
SELECT months, accumulated_created as "Count", 'Accumulated' as "Chart", 'Created' as "Type" FROM all_metrics
UNION ALL
SELECT months, accumulated_closed as "Count", 'Accumulated' as "Chart", 'Closed' as "Type" FROM all_metrics
ORDER BY months, "Chart", "Type"
""").df()
# Filter the single DataFrame for each chart
df_monthly = df_long[df_long["Chart"] == "Monthly"]
df_accumulated = df_long[df_long["Chart"] == "Accumulated"]
util.show_figure(
[
util.FigureConfig(
title="Monthly Notes: Created vs. Closed",
label="Monthly",
x_col="months",
y_col="Count",
group_col="Type",
query_or_df=df_monthly,
trace_names=["Created", "Closed"],
),
util.FigureConfig(
title="Accumulated Notes: Created vs. Closed",
label="Accumulated",
x_col="months",
y_col="Count",
group_col="Type",
query_or_df=df_accumulated,
trace_names=["Created", "Closed"],
),
]
)
df = duckdb.sql("""
SELECT
mid_pos_x as x,
mid_pos_y as y,
COUNT(*) as z
FROM '../notes_data/*.parquet'
WHERE mid_pos_x IS NOT NULL AND mid_pos_y IS NOT NULL
GROUP BY mid_pos_x, mid_pos_y
""").df()
util.show_figure(
[
util.FigureConfig(
title="Notes Count",
x_col="x",
y_col="y",
z_col="z",
query_or_df=df,
plot_type="map",
)
]
)
df = duckdb.sql("""
SELECT
YEAR(timestamp) as year,
MONTH(timestamp) as month,
CONCAT(YEAR(timestamp), '-', LPAD(CAST(MONTH(timestamp) as VARCHAR), 2, '0')) as months,
COUNT(*) as "Comments",
COUNT(DISTINCT user_name) as "Commenters",
COUNT(DISTINCT note_id) as "Notes with Comments"
FROM '../notes_comments_data/*.parquet'
WHERE user_name != ''
GROUP BY YEAR(timestamp), MONTH(timestamp)
ORDER BY year, month
""").df()
util.show_figure(
[
util.FigureConfig(
title="Monthly Comments",
label="Comments",
x_col="months",
y_col="Comments",
query_or_df=df,
),
util.FigureConfig(
title="Monthly Commenters",
label="Commenters",
x_col="months",
y_col="Commenters",
query_or_df=df,
),
util.FigureConfig(
title="Monthly Notes with Comments",
label="Notes",
x_col="months",
y_col="Notes with Comments",
query_or_df=df,
),
]
)
df = duckdb.sql("""
SELECT
YEAR(timestamp) as year,
MONTH(timestamp) as month,
CONCAT(YEAR(timestamp), '-', LPAD(CAST(MONTH(timestamp) as VARCHAR), 2, '0')) as months,
action,
COUNT(*) as "Count"
FROM '../notes_comments_data/*.parquet'
GROUP BY YEAR(timestamp), MONTH(timestamp), action
ORDER BY year, month, action
""").df()
util.show_figure(
[
util.FigureConfig(
title="Monthly Comment Actions",
label="Actions",
x_col="months",
y_col="Count",
group_col="action",
query_or_df=df,
)
]
)
df = duckdb.sql("""
SELECT
YEAR(created_at) as year,
MONTH(created_at) as month,
CONCAT(YEAR(created_at), '-', LPAD(CAST(MONTH(created_at) as VARCHAR), 2, '0')) as months,
AVG(DATEDIFF('day', created_at, closed_at)) as "Average Days to Close",
MEDIAN(DATEDIFF('day', created_at, closed_at)) as "Median Days to Close",
COUNT(*) as "Closed Notes"
FROM '../notes_data/*.parquet'
WHERE closed_at IS NOT NULL
GROUP BY YEAR(created_at), MONTH(created_at)
ORDER BY year, month
""").df()
util.show_figure(
[
util.FigureConfig(
title="Average Days to Close Notes",
label="Average Days",
x_col="months",
y_col="Average Days to Close",
query_or_df=df,
),
util.FigureConfig(
title="Median Days to Close Notes",
label="Median Days",
x_col="months",
y_col="Median Days to Close",
query_or_df=df,
),
]
)