OpenStreetMap Changeset Discussions

This notebook analyzes discussions and comments on OpenStreetMap changesets. Changeset discussions allow mappers to communicate about specific changes, ask questions, report issues or provide feedback.

Monthly Discussion Activity

code
df = duckdb.sql("""
WITH comment_dates AS (
    SELECT
        YEAR(date) as year,
        MONTH(date) as month,
        CONCAT(CAST(YEAR(date) AS VARCHAR), '-', LPAD(CAST(MONTH(date) AS VARCHAR), 2, '0')) as months,
        user_name,
        changeset_id,
        text
    FROM '../changeset_comments_data/*.parquet'
),
user_first_comment AS (
    SELECT
        user_name,
        year,
        month,
        ROW_NUMBER() OVER (PARTITION BY user_name ORDER BY year, month) as rn
    FROM (
        SELECT DISTINCT user_name, year, month
        FROM comment_dates
    )
),
first_commenters AS (
    SELECT user_name, year, month
    FROM user_first_comment
    WHERE rn = 1
),
monthly_metrics AS (
    SELECT
        year,
        month,
        months,
        COUNT(*) as "Comments",
        COUNT(DISTINCT user_name) as "Commenters",
        COUNT(DISTINCT changeset_id) as "Changesets with Comments"
    FROM comment_dates
    GROUP BY year, month, months
),
monthly_new_commenters AS (
    SELECT
        year,
        month,
        COUNT(DISTINCT user_name) as "New Commenters"
    FROM first_commenters
    GROUP BY year, month
),
combined_metrics AS (
    SELECT
        m.year,
        m.month,
        m.months,
        m.Comments,
        m.Commenters,
        COALESCE(n."New Commenters", 0) as "New Commenters",
        m."Changesets with Comments"
    FROM monthly_metrics m
    LEFT JOIN monthly_new_commenters n ON m.year = n.year AND m.month = n.month
)
SELECT
    months,
    Comments,
    Commenters,
    "New Commenters",
    "Changesets with Comments",
    SUM(Comments) OVER (ORDER BY year, month) as "Accumulated Comments",
    SUM(Commenters) OVER (ORDER BY year, month) as "Accumulated Commenters"
FROM combined_metrics
ORDER BY year, month
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Comments",
            label="Comments",
            x_col="months",
            y_col="Comments",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Commenters",
            label="Commenters",
            x_col="months",
            y_col="Commenters",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly New Commenters",
            label="New Commenters",
            x_col="months",
            y_col="New Commenters",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Changesets with Comments",
            label="Changesets",
            x_col="months",
            y_col="Changesets with Comments",
            query_or_df=df,
        ),
    ]
)

Accumulated Discussion Activity

code
util.show_figure(
    [
        util.FigureConfig(
            title="Accumulated Comments",
            label="Accumulated Comments",
            x_col="months",
            y_col="Accumulated Comments",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Commenters",
            label="Accumulated Commenters",
            x_col="months",
            y_col="Accumulated Commenters",
            query_or_df=df,
        ),
    ]
)

Comment Length Distribution

code
df = duckdb.sql("""
WITH comment_lengths AS (
    SELECT
        YEAR(date) as year,
        MONTH(date) as month,
        CONCAT(CAST(YEAR(date) AS VARCHAR), '-', LPAD(CAST(MONTH(date) AS VARCHAR), 2, '0')) as months,
        LENGTH(text) as comment_length
    FROM '../changeset_comments_data/*.parquet'
    WHERE text IS NOT NULL
)
SELECT
    months,
    CAST(AVG(comment_length) as INTEGER) as "Average Comment Length",
    CAST(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY comment_length) as INTEGER) as "Median Comment Length"
FROM comment_lengths
GROUP BY year, month, months
ORDER BY year, month
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Average Comment Length (Characters)",
            label="Average Length",
            x_col="months",
            y_col="Average Comment Length",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Median Comment Length (Characters)",
            label="Median Length",
            x_col="months",
            y_col="Median Comment Length",
            query_or_df=df,
        ),
    ]
)

Top 10 hashtags with associated comment metrics

code
df = duckdb.sql("""
WITH comments AS (
    SELECT
        c.changeset_id,
        YEAR(c.date) AS year,
        MONTH(c.date) AS month,
        CONCAT(CAST(YEAR(c.date) AS VARCHAR), '-', LPAD(CAST(MONTH(c.date) AS VARCHAR), 2, '0')) AS months,
        c.user_name AS commenter_name,
        c.text
    FROM '../changeset_comments_data/*.parquet' c
),
changesets AS (
    SELECT
        changeset_id,
        unnest(hashtags) AS hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL
),
-- First expand all hashtags for counting edits
hashtag_expanded AS (
    SELECT 
        edit_count,
        unnest(hashtags) as hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL
),
-- Get top 10 hashtags by total edits (overall usage)
top_hashtags AS (
    SELECT hashtag
    FROM (
        SELECT
            hashtag,
            SUM(edit_count) as total_edits
        FROM hashtag_expanded
        GROUP BY hashtag
        ORDER BY total_edits DESC
        LIMIT 10
    )
),
-- Include all comments, even those with no hashtag
joined AS (
    SELECT
        co.year,
        co.month,
        co.months,
        COALESCE(ch.hashtag, 'No Hashtag') AS hashtag,
        co.commenter_name,
        co.changeset_id
    FROM comments co
    LEFT JOIN changesets ch ON co.changeset_id = ch.changeset_id
),
monthly_hashtag_metrics AS (
    SELECT
        year,
        month,
        months,
        hashtag,
        COUNT(*) AS "Comments",
        COUNT(DISTINCT commenter_name) AS "Commenters",
        COUNT(DISTINCT changeset_id) AS "Changesets with Comments"
    FROM joined
    GROUP BY year, month, months, hashtag
)
SELECT
    months,
    hashtag,
    "Comments",
    "Commenters",
    "Changesets with Comments",
    SUM("Comments") OVER (PARTITION BY hashtag ORDER BY year, month) AS "Accumulated Comments",
    SUM("Commenters") OVER (PARTITION BY hashtag ORDER BY year, month) AS "Accumulated Commenters"
FROM monthly_hashtag_metrics
WHERE hashtag IN (SELECT hashtag FROM top_hashtags) OR hashtag = 'No Hashtag'
ORDER BY year, month, hashtag
""").df()

df["Comments per Changeset"] = df["Comments"] / df["Changesets with Comments"]
df["Commenters per Changeset"] = df["Commenters"] / df["Changesets with Comments"]

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Comments by Top 10 Hashtags (plus No Hashtag)",
            label="Comments",
            x_col="months",
            y_col="Comments",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Comments by Top 10 Hashtags (plus No Hashtag)",
            label="Accumulated Comments",
            x_col="months",
            y_col="Accumulated Comments",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Commenters by Top 10 Hashtags (plus No Hashtag)",
            label="Commenters",
            x_col="months",
            y_col="Commenters",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Average Comments per Changeset (by Hashtag, plus No Hashtag)",
            label="Comments per Changeset",
            x_col="months",
            y_col="Comments per Changeset",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Average Commenters per Changeset (by Hashtag, plus No Hashtag)",
            label="Commenters per Changeset",
            x_col="months",
            y_col="Commenters per Changeset",
            group_col="hashtag",
            query_or_df=df,
        ),
    ]
)

Are changeset comments written by the changeset user or by others?

code
df_comment_type = duckdb.sql("""
WITH comments AS (
    SELECT
        c.changeset_id,
        YEAR(c.date) AS year,
        MONTH(c.date) AS month,
        CONCAT(CAST(YEAR(c.date) AS VARCHAR), '-', LPAD(CAST(MONTH(c.date) AS VARCHAR), 2, '0')) AS months,
        c.user_name AS commenter_name
    FROM '../changeset_comments_data/*.parquet' c
),
changesets AS (
    SELECT
        changeset_id,
        user_name AS author_name
    FROM '../changeset_data/year=*/month=*/*.parquet'
),
joined AS (
    SELECT  
        co.year,
        co.month,
        co.months,
        CASE
            WHEN co.commenter_name = ch.author_name THEN 'Author'
            ELSE 'Others'
        END AS comment_type,
        co.changeset_id
    FROM comments co
    JOIN changesets ch ON co.changeset_id = ch.changeset_id
),
monthly_comment_type AS (
    SELECT
        year,
        month,
        months,
        comment_type,
        COUNT(*) AS "Comments",
    FROM joined
    GROUP BY year, month, months, comment_type
)
SELECT
    months,
    comment_type,
    "Comments",
    SUM("Comments") OVER (PARTITION BY comment_type ORDER BY year, month) AS "Accumulated Comments"
FROM monthly_comment_type
ORDER BY year, month, comment_type
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Comments by Comment Type (Author vs Others)",
            label="Comments",
            x_col="months",
            y_col="Comments",
            group_col="comment_type",
            query_or_df=df_comment_type,
        ),
        util.FigureConfig(
            title="Accumulated Comments by Comment Type (Author vs Others)",
            label="Accumulated Comments",
            x_col="months",
            y_col="Accumulated Comments",
            group_col="comment_type",
            query_or_df=df_comment_type,
        ),
    ]
)