OpenStreetMap General Statistics

This website contains monthly updated OpenStreetMap statistics. The code is Open Source and can be found on GitHub. If you know SQL, you can also create your own statistics using the preprocessed data. The code for creating the plots can be explored by toggling the code cell.

Contributors, Edits and Changesets Each Month

code

df = duckdb.sql("""
WITH user_first_appearance AS (
    SELECT
        user_name,
        year,
        month,
        ROW_NUMBER() OVER (PARTITION BY user_name ORDER BY year, month) as rn
    FROM (
        SELECT DISTINCT user_name, year, month
        FROM '../changeset_data/year=*/month=*/*.parquet'
    )
),
first_appearances AS (
    SELECT user_name, year, month
    FROM user_first_appearance
    WHERE rn = 1
),
monthly_metrics AS (
    SELECT
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        COUNT(DISTINCT user_name) as Contributors,
        CAST(SUM(edit_count) as BIGINT) as Edits,
        CAST(COUNT(*) AS INTEGER) as Changesets
    FROM '../changeset_data/year=*/month=*/*.parquet'
    GROUP BY year, month
),
monthly_new_contributors AS (
    SELECT
        year,
        month,
        COUNT(DISTINCT user_name) as "New Contributors"
    FROM first_appearances
    GROUP BY year, month
),
combined_metrics AS (
    SELECT
        m.year,
        m.month,
        m.months,
        m.Contributors,
        COALESCE(n."New Contributors", 0) as "New Contributors",
        m.Edits,
        m.Changesets
    FROM monthly_metrics m
    LEFT JOIN monthly_new_contributors n ON m.year = n.year AND m.month = n.month
)
SELECT
    months,
    Contributors,
    "New Contributors",
    Edits,
    Changesets,
    SUM("New Contributors") OVER (ORDER BY year, month) as "Accumulated Contributors",
    SUM(Edits) OVER (ORDER BY year, month) as "Accumulated Edits",
    SUM(Changesets) OVER (ORDER BY year, month) as "Accumulated Changesets"
FROM combined_metrics
ORDER BY year, month
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Contributors",
            label="Contributors",
            x_col="months",
            y_col="Contributors",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly New Contributors",
            label="New Contributors",
            x_col="months",
            y_col="New Contributors",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Edits",
            label="Edits",
            x_col="months",
            y_col="Edits",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Changesets",
            label="Changesets",
            x_col="months",
            y_col="Changesets",
            query_or_df=df,
        ),
    ]
)

Accumulated Contributors, Edits and Changesets Each Month

code

util.show_figure(
    [
        util.FigureConfig(
            title="Accumulated Contributors",
            label="Accumulated Contributors",
            x_col="months",
            y_col="Accumulated Contributors",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Edits",
            label="Accumulated Edits",
            x_col="months",
            y_col="Accumulated Edits",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Changesets",
            label="Accumulated Changesets",
            x_col="months",
            y_col="Accumulated Changesets",
            query_or_df=df,
        ),
    ]
)

Where are the Edits Happening?

code

df = duckdb.sql("""
SELECT
    mid_pos_x as x,
    mid_pos_y as y,
    SUM(edit_count) as z
FROM '../changeset_data/year=*/month=*/*.parquet'
WHERE mid_pos_x IS NOT NULL AND mid_pos_y IS NOT NULL
GROUP BY mid_pos_x, mid_pos_y
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Edit Count",
            x_col="x",
            y_col="y",
            z_col="z",
            query_or_df=df,
            plot_type="map",
        ),
    ]
)

Contributor Attrition Rate by First Edit Period (Yearly)

This shows the contribution patterns of users grouped by their first edit period (2-year periods), displaying the percentage of total yearly edits from each cohort.

code

sql_query = """
WITH user_first_edit AS (
    -- Find the first edit year for each user
    SELECT 
        user_name,
        MIN(year) as first_edit_year
    FROM 
        read_parquet('../changeset_data/year=*/month=*/*.parquet')
    GROUP BY 
        user_name
),
year_user_edits AS (
    -- Sum edits by year and user
    SELECT 
        year,
        user_name,
        SUM(edit_count) as total_edits
    FROM 
        read_parquet('../changeset_data/year=*/month=*/*.parquet')
    GROUP BY 
        year, user_name
),
merged_data AS (
    -- Join to get first edit year for each user's yearly edits
    SELECT 
        y.year,
        y.user_name,
        y.total_edits,
        u.first_edit_year,
        CASE 
            -- Create 2-year periods based on odd/even years
            WHEN u.first_edit_year % 2 = 1 THEN 
                CONCAT('First edit in: ', CAST(u.first_edit_year AS VARCHAR), '-', CAST(u.first_edit_year + 1 AS VARCHAR))
            ELSE 
                CONCAT('First edit in: ', CAST(u.first_edit_year - 1 AS VARCHAR), '-', CAST(u.first_edit_year AS VARCHAR))
        END as first_edit_period
    FROM 
        year_user_edits y
    JOIN 
        user_first_edit u ON y.user_name = u.user_name
),
grouped_data AS (
    SELECT 
        CAST(year AS VARCHAR) as years,
        first_edit_period,
        SUM(total_edits) as total_edits
    FROM 
        merged_data
    GROUP BY 
        year, first_edit_period
)
SELECT 
    years,
    first_edit_period,
    total_edits,
    ROUND(100.0 * total_edits / SUM(total_edits) OVER (PARTITION BY years), 2) as percentage
FROM 
    grouped_data
ORDER BY 
    years, first_edit_period
"""
df = duckdb.sql(sql_query).df()

util.show_figure(
    [
        util.FigureConfig(
            title="Contributor Attrition Rate by First Edit Period",
            label="Absolute",
            x_col="years",
            y_col="total_edits",
            query_or_df=df,
            group_col="first_edit_period",
            plot_type="bar",
            trace_names=util.get_trace_names(df, "first_edit_period", "years", "total_edits", "unique"),
        ),
        util.FigureConfig(
            title="Percentage of Edits by First Edit Period",
            label="Percentage",
            x_col="years",
            y_col="percentage",
            query_or_df=df,
            group_col="first_edit_period",
            plot_type="bar",
        ),
    ]
)

Contributors with More than K Edits Total

This shows the number of contributors who have accumulated more than various edit count thresholds (10, 100, 1000, 10000, 100000 edits) throughout their entire OSM history up to each month.

code

df = duckdb.sql("""
WITH user_cumulative_edits AS (
    SELECT
        year,
        month,
        user_name,
        SUM(edit_count) OVER (PARTITION BY user_name ORDER BY year, month) as cumulative_edits
    FROM (
        SELECT
            year,
            month,
            user_name,
            SUM(edit_count) as edit_count
        FROM '../changeset_data/year=*/month=*/*.parquet'
        GROUP BY year, month, user_name
    )
),
monthly_thresholds AS (
    SELECT
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        COUNT(DISTINCT CASE WHEN cumulative_edits > 10 THEN user_name END) as "more then 10 edits",
        COUNT(DISTINCT CASE WHEN cumulative_edits > 100 THEN user_name END) as "more then 100 edits",
        COUNT(DISTINCT CASE WHEN cumulative_edits > 1000 THEN user_name END) as "more then 1000 edits",
        COUNT(DISTINCT CASE WHEN cumulative_edits > 10000 THEN user_name END) as "more then 10000 edits",
        COUNT(DISTINCT CASE WHEN cumulative_edits > 100000 THEN user_name END) as "more then 100000 edits"
    FROM user_cumulative_edits
    GROUP BY year, month
)
SELECT
    months,
    "more then 10 edits",
    "more then 100 edits",
    "more then 1000 edits",
    "more then 10000 edits",
    "more then 100000 edits"
FROM monthly_thresholds
ORDER BY year, month
""").df()

# Reshape for plotting
df_melted = df.melt(id_vars=["months"], var_name="threshold", value_name="contributors")

util.show_figure(
    [
        util.FigureConfig(
            title="Contributors with More than K Edits Total",
            label="contributors",
            x_col="months",
            y_col="contributors",
            group_col="threshold",
            query_or_df=df_melted,
            trace_names=[
                "more then 10 edits",
                "more then 100 edits",
                "more then 1000 edits",
                "more then 10000 edits",
                "more then 100000 edits",
            ],
        )
    ]
)

New Contributors without MAPS.ME and only MAPS.ME

There is a big spike of new contributors in April 2016. This is because a lot of users using MAPS.ME were creating their first edits.

code

sql_query = """
WITH user_first_edit AS (
    -- Find the first edit month for each user and determine if they used MAPS.ME
    SELECT 
        user_name,
        MIN(CONCAT(CAST(year AS VARCHAR), '-', LPAD(CAST(month AS VARCHAR), 2, '0'))) as first_edit_month,
        -- Check if user ever used MAPS.ME
        BOOL_OR(CASE WHEN created_by LIKE '%MAPS.ME%' THEN true ELSE false END) as used_maps_me
    FROM 
        read_parquet('../changeset_data/year=*/month=*/*.parquet')
    GROUP BY 
        user_name
)
SELECT 
    first_edit_month as Months,
    COUNT(*) as 'New Contributors',
    CASE 
        WHEN used_maps_me THEN 'Only MAPS.ME'
        ELSE 'Without MAPS.ME'
    END as group_column
FROM 
    user_first_edit
GROUP BY 
    first_edit_month, used_maps_me
ORDER BY 
    first_edit_month, group_column
"""

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly New Contributors with and without MAPS.ME",
            x_col="Months",
            y_col="New Contributors",
            query_or_df=sql_query,
            group_col="group_column",
        )
    ]
)

Median Number of Edits per Contributor (Yearly)

This shows the median number of edits made by contributors each year, providing insight into typical contributor activity levels.

code

df_median_edits_yearly = duckdb.sql("""
WITH yearly_contributor_edits AS (
    SELECT
        year,
        user_name,
        CAST(SUM(edit_count) as INTEGER) as user_edits
    FROM '../changeset_data/year=*/month=*/*.parquet'
    GROUP BY year, user_name
)
SELECT
    CAST(year AS VARCHAR) as Years,
    CAST(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY user_edits) as INTEGER) as "Median edits per contributor"
FROM yearly_contributor_edits
GROUP BY year
ORDER BY year
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Median Number of Edits per Contributor",
            x_col="Years",
            y_col="Median edits per contributor",
            query_or_df=df_median_edits_yearly,
        ),
    ],
)

Where are Edits Happening Each Year?

code

df = duckdb.sql("""
SELECT
    year,
    mid_pos_x as x,
    mid_pos_y as y,
    SUM(edit_count) as z
FROM '../changeset_data/year=*/month=*/*.parquet'
WHERE mid_pos_x IS NOT NULL AND mid_pos_y IS NOT NULL
GROUP BY year, mid_pos_x, mid_pos_y
""").df()

configs = []
for year in sorted(df["year"].unique()):
    configs.append(
        util.FigureConfig(
            title=f"Edit Count {year}",
            x_col="x",
            y_col="y",
            z_col="z",
            query_or_df=df[df["year"] == year],
            plot_type="map",
        )
    )
util.show_figure(configs, type="dropdown")