This notebook analyzes discussions and comments on OpenStreetMap changesets. Changeset discussions allow mappers to communicate about specific changes, ask questions, report issues or provide feedback.
df = duckdb.sql("""
WITH comment_dates AS (
SELECT
YEAR(date) as year,
MONTH(date) as month,
CONCAT(CAST(YEAR(date) AS VARCHAR), '-', LPAD(CAST(MONTH(date) AS VARCHAR), 2, '0')) as months,
user_name,
changeset_id,
text
FROM '../changeset_comments_data/*.parquet'
),
user_first_comment AS (
SELECT
user_name,
year,
month,
ROW_NUMBER() OVER (PARTITION BY user_name ORDER BY year, month) as rn
FROM (
SELECT DISTINCT user_name, year, month
FROM comment_dates
)
),
first_commenters AS (
SELECT user_name, year, month
FROM user_first_comment
WHERE rn = 1
),
monthly_metrics AS (
SELECT
year,
month,
months,
COUNT(*) as "Comments",
COUNT(DISTINCT user_name) as "Commenters",
COUNT(DISTINCT changeset_id) as "Changesets with Comments"
FROM comment_dates
GROUP BY year, month, months
),
monthly_new_commenters AS (
SELECT
year,
month,
COUNT(DISTINCT user_name) as "New Commenters"
FROM first_commenters
GROUP BY year, month
),
combined_metrics AS (
SELECT
m.year,
m.month,
m.months,
m.Comments,
m.Commenters,
COALESCE(n."New Commenters", 0) as "New Commenters",
m."Changesets with Comments"
FROM monthly_metrics m
LEFT JOIN monthly_new_commenters n ON m.year = n.year AND m.month = n.month
)
SELECT
months,
Comments,
Commenters,
"New Commenters",
"Changesets with Comments",
SUM(Comments) OVER (ORDER BY year, month) as "Accumulated Comments",
SUM(Commenters) OVER (ORDER BY year, month) as "Accumulated Commenters"
FROM combined_metrics
ORDER BY year, month
""").df()
util.show_figure(
[
util.FigureConfig(
title="Monthly Comments",
label="Comments",
x_col="months",
y_col="Comments",
query_or_df=df,
),
util.FigureConfig(
title="Monthly Commenters",
label="Commenters",
x_col="months",
y_col="Commenters",
query_or_df=df,
),
util.FigureConfig(
title="Monthly New Commenters",
label="New Commenters",
x_col="months",
y_col="New Commenters",
query_or_df=df,
),
util.FigureConfig(
title="Monthly Changesets with Comments",
label="Changesets",
x_col="months",
y_col="Changesets with Comments",
query_or_df=df,
),
]
)
util.show_figure(
[
util.FigureConfig(
title="Accumulated Comments",
label="Accumulated Comments",
x_col="months",
y_col="Accumulated Comments",
query_or_df=df,
),
util.FigureConfig(
title="Accumulated Commenters",
label="Accumulated Commenters",
x_col="months",
y_col="Accumulated Commenters",
query_or_df=df,
),
]
)
df = duckdb.sql("""
WITH comment_lengths AS (
SELECT
YEAR(date) as year,
MONTH(date) as month,
CONCAT(CAST(YEAR(date) AS VARCHAR), '-', LPAD(CAST(MONTH(date) AS VARCHAR), 2, '0')) as months,
LENGTH(text) as comment_length
FROM '../changeset_comments_data/*.parquet'
WHERE text IS NOT NULL
)
SELECT
months,
CAST(AVG(comment_length) as INTEGER) as "Average Comment Length",
CAST(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY comment_length) as INTEGER) as "Median Comment Length"
FROM comment_lengths
GROUP BY year, month, months
ORDER BY year, month
""").df()
util.show_figure(
[
util.FigureConfig(
title="Average Comment Length (Characters)",
label="Average Length",
x_col="months",
y_col="Average Comment Length",
query_or_df=df,
),
util.FigureConfig(
title="Median Comment Length (Characters)",
label="Median Length",
x_col="months",
y_col="Median Comment Length",
query_or_df=df,
),
]
)
df = duckdb.sql("""
WITH comments AS (
SELECT
c.changeset_id,
YEAR(c.date) AS year,
MONTH(c.date) AS month,
CONCAT(CAST(YEAR(c.date) AS VARCHAR), '-', LPAD(CAST(MONTH(c.date) AS VARCHAR), 2, '0')) AS months,
c.user_name AS commenter_name,
c.text
FROM '../changeset_comments_data/*.parquet' c
),
changesets AS (
SELECT
changeset_id,
unnest(hashtags) AS hashtag
FROM '../changeset_data/year=*/month=*/*.parquet'
WHERE hashtags IS NOT NULL
),
-- First expand all hashtags for counting edits
hashtag_expanded AS (
SELECT
edit_count,
unnest(hashtags) as hashtag
FROM '../changeset_data/year=*/month=*/*.parquet'
WHERE hashtags IS NOT NULL
),
-- Get top 10 hashtags by total edits (overall usage)
top_hashtags AS (
SELECT hashtag
FROM (
SELECT
hashtag,
SUM(edit_count) as total_edits
FROM hashtag_expanded
GROUP BY hashtag
ORDER BY total_edits DESC
LIMIT 10
)
),
-- Include all comments, even those with no hashtag
joined AS (
SELECT
co.year,
co.month,
co.months,
COALESCE(ch.hashtag, 'No Hashtag') AS hashtag,
co.commenter_name,
co.changeset_id
FROM comments co
LEFT JOIN changesets ch ON co.changeset_id = ch.changeset_id
),
monthly_hashtag_metrics AS (
SELECT
year,
month,
months,
hashtag,
COUNT(*) AS "Comments",
COUNT(DISTINCT commenter_name) AS "Commenters",
COUNT(DISTINCT changeset_id) AS "Changesets with Comments"
FROM joined
GROUP BY year, month, months, hashtag
)
SELECT
months,
hashtag,
"Comments",
"Commenters",
"Changesets with Comments",
SUM("Comments") OVER (PARTITION BY hashtag ORDER BY year, month) AS "Accumulated Comments",
SUM("Commenters") OVER (PARTITION BY hashtag ORDER BY year, month) AS "Accumulated Commenters"
FROM monthly_hashtag_metrics
WHERE hashtag IN (SELECT hashtag FROM top_hashtags) OR hashtag = 'No Hashtag'
ORDER BY year, month, hashtag
""").df()
df["Comments per Changeset"] = df["Comments"] / df["Changesets with Comments"]
df["Commenters per Changeset"] = df["Commenters"] / df["Changesets with Comments"]
util.show_figure(
[
util.FigureConfig(
title="Monthly Comments by Top 10 Hashtags (plus No Hashtag)",
label="Comments",
x_col="months",
y_col="Comments",
group_col="hashtag",
query_or_df=df,
),
util.FigureConfig(
title="Accumulated Comments by Top 10 Hashtags (plus No Hashtag)",
label="Accumulated Comments",
x_col="months",
y_col="Accumulated Comments",
group_col="hashtag",
query_or_df=df,
),
util.FigureConfig(
title="Monthly Commenters by Top 10 Hashtags (plus No Hashtag)",
label="Commenters",
x_col="months",
y_col="Commenters",
group_col="hashtag",
query_or_df=df,
),
util.FigureConfig(
title="Average Comments per Changeset (by Hashtag, plus No Hashtag)",
label="Comments per Changeset",
x_col="months",
y_col="Comments per Changeset",
group_col="hashtag",
query_or_df=df,
),
util.FigureConfig(
title="Average Commenters per Changeset (by Hashtag, plus No Hashtag)",
label="Commenters per Changeset",
x_col="months",
y_col="Commenters per Changeset",
group_col="hashtag",
query_or_df=df,
),
]
)
df_comment_type = duckdb.sql("""
WITH comments AS (
SELECT
c.changeset_id,
YEAR(c.date) AS year,
MONTH(c.date) AS month,
CONCAT(CAST(YEAR(c.date) AS VARCHAR), '-', LPAD(CAST(MONTH(c.date) AS VARCHAR), 2, '0')) AS months,
c.user_name AS commenter_name
FROM '../changeset_comments_data/*.parquet' c
),
changesets AS (
SELECT
changeset_id,
user_name AS author_name
FROM '../changeset_data/year=*/month=*/*.parquet'
),
joined AS (
SELECT
co.year,
co.month,
co.months,
CASE
WHEN co.commenter_name = ch.author_name THEN 'Author'
ELSE 'Others'
END AS comment_type,
co.changeset_id
FROM comments co
JOIN changesets ch ON co.changeset_id = ch.changeset_id
),
monthly_comment_type AS (
SELECT
year,
month,
months,
comment_type,
COUNT(*) AS "Comments",
FROM joined
GROUP BY year, month, months, comment_type
)
SELECT
months,
comment_type,
"Comments",
SUM("Comments") OVER (PARTITION BY comment_type ORDER BY year, month) AS "Accumulated Comments"
FROM monthly_comment_type
ORDER BY year, month, comment_type
""").df()
util.show_figure(
[
util.FigureConfig(
title="Monthly Comments by Comment Type (Author vs Others)",
label="Comments",
x_col="months",
y_col="Comments",
group_col="comment_type",
query_or_df=df_comment_type,
),
util.FigureConfig(
title="Accumulated Comments by Comment Type (Author vs Others)",
label="Accumulated Comments",
x_col="months",
y_col="Accumulated Comments",
group_col="comment_type",
query_or_df=df_comment_type,
),
]
)