Notes Analysis

This notebook analyzes OpenStreetMap notes - map feedback reports that help improve data quality. Notes allow anyone to report issues or suggest improvements to the map.

Monthly Notes Activity

code
df_long = duckdb.sql("""
WITH notes_created AS (
    SELECT
        YEAR(created_at) as year,
        MONTH(created_at) as month,
        COUNT(*) as created_count
    FROM '../notes_data/*.parquet'
    GROUP BY year, month
),
notes_closed AS (
    SELECT
        YEAR(closed_at) as year,
        MONTH(closed_at) as month,
        COUNT(*) as closed_count
    FROM '../notes_data/*.parquet'
    WHERE closed_at IS NOT NULL
    GROUP BY year, month
),
all_metrics AS (
    -- This CTE calculates both monthly and accumulated values in one place
    SELECT
        CONCAT(nc.year, '-', LPAD(CAST(nc.month as VARCHAR), 2, '0')) as months,
        nc.created_count,
        COALESCE(ncl.closed_count, 0) as closed_count,
        SUM(nc.created_count) OVER (ORDER BY nc.year, nc.month) as accumulated_created,
        SUM(COALESCE(ncl.closed_count, 0)) OVER (ORDER BY nc.year, nc.month) as accumulated_closed
    FROM notes_created nc
    LEFT JOIN notes_closed ncl ON nc.year = ncl.year AND nc.month = ncl.month
)
-- Reshape all metrics into a single long-format table with a 'Chart' column for filtering
SELECT months, created_count as "Count", 'Monthly' as "Chart", 'Created' as "Type" FROM all_metrics
UNION ALL
SELECT months, closed_count as "Count", 'Monthly' as "Chart", 'Closed' as "Type" FROM all_metrics
UNION ALL
SELECT months, accumulated_created as "Count", 'Accumulated' as "Chart", 'Created' as "Type" FROM all_metrics
UNION ALL
SELECT months, accumulated_closed as "Count", 'Accumulated' as "Chart", 'Closed' as "Type" FROM all_metrics
ORDER BY months, "Chart", "Type"
""").df()

# Filter the single DataFrame for each chart
df_monthly = df_long[df_long["Chart"] == "Monthly"]
df_accumulated = df_long[df_long["Chart"] == "Accumulated"]

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Notes: Created vs. Closed",
            label="Monthly",
            x_col="months",
            y_col="Count",
            group_col="Type",
            query_or_df=df_monthly,
            trace_names=["Created", "Closed"],
        ),
        util.FigureConfig(
            title="Accumulated Notes: Created vs. Closed",
            label="Accumulated",
            x_col="months",
            y_col="Count",
            group_col="Type",
            query_or_df=df_accumulated,
            trace_names=["Created", "Closed"],
        ),
    ]
)

Geographic Distribution of Notes

code
df = duckdb.sql("""
SELECT
    mid_pos_x as x,
    mid_pos_y as y,
    COUNT(*) as z
FROM '../notes_data/*.parquet'
WHERE mid_pos_x IS NOT NULL AND mid_pos_y IS NOT NULL
GROUP BY mid_pos_x, mid_pos_y
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Notes Count",
            x_col="x",
            y_col="y",
            z_col="z",
            query_or_df=df,
            plot_type="map",
        )
    ]
)

Notes Comments Activity

code
df = duckdb.sql("""
SELECT
    YEAR(timestamp) as year,
    MONTH(timestamp) as month,
    CONCAT(YEAR(timestamp), '-', LPAD(CAST(MONTH(timestamp) as VARCHAR), 2, '0')) as months,
    COUNT(*) as "Comments",
    COUNT(DISTINCT user_name) as "Commenters",
    COUNT(DISTINCT note_id) as "Notes with Comments"
FROM '../notes_comments_data/*.parquet'
WHERE user_name != ''
GROUP BY YEAR(timestamp), MONTH(timestamp)
ORDER BY year, month
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Comments",
            label="Comments",
            x_col="months",
            y_col="Comments",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Commenters",
            label="Commenters",
            x_col="months",
            y_col="Commenters",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Notes with Comments",
            label="Notes",
            x_col="months",
            y_col="Notes with Comments",
            query_or_df=df,
        ),
    ]
)

Comment Action Types

code
df = duckdb.sql("""
SELECT
    YEAR(timestamp) as year,
    MONTH(timestamp) as month,
    CONCAT(YEAR(timestamp), '-', LPAD(CAST(MONTH(timestamp) as VARCHAR), 2, '0')) as months,
    action,
    COUNT(*) as "Count"
FROM '../notes_comments_data/*.parquet'
GROUP BY YEAR(timestamp), MONTH(timestamp), action
ORDER BY year, month, action
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Comment Actions",
            label="Actions",
            x_col="months",
            y_col="Count",
            group_col="action",
            query_or_df=df,
        )
    ]
)

Average Time to Close Notes

code
df = duckdb.sql("""
SELECT
    YEAR(created_at) as year,
    MONTH(created_at) as month,
    CONCAT(YEAR(created_at), '-', LPAD(CAST(MONTH(created_at) as VARCHAR), 2, '0')) as months,
    AVG(DATEDIFF('day', created_at, closed_at)) as "Average Days to Close",
    MEDIAN(DATEDIFF('day', created_at, closed_at)) as "Median Days to Close",
    COUNT(*) as "Closed Notes"
FROM '../notes_data/*.parquet'
WHERE closed_at IS NOT NULL
GROUP BY YEAR(created_at), MONTH(created_at)
ORDER BY year, month
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Average Days to Close Notes",
            label="Average Days",
            x_col="months",
            y_col="Average Days to Close",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Median Days to Close Notes",
            label="Median Days",
            x_col="months",
            y_col="Median Days to Close",
            query_or_df=df,
        ),
    ]
)