Hashtags Analysis

Analysis of OpenStreetMap changesets that use hashtags from the hashtags column.

Monthly Percentage of Edits and Contributors Using Hashtags

code
df = duckdb.sql("""
WITH monthly_with_hashtags AS (
    SELECT 
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        COUNT(DISTINCT user_name) as contributors_with_hashtags,
        SUM(edit_count) as edits_with_hashtags
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL
    GROUP BY year, month
),
monthly_total AS (
    SELECT 
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        COUNT(DISTINCT user_name) as total_contributors,
        SUM(edit_count) as total_edits
    FROM '../changeset_data/year=*/month=*/*.parquet'
    GROUP BY year, month
)
SELECT 
    mt.months,
    COALESCE(ROUND((mwh.contributors_with_hashtags * 100.0) / mt.total_contributors, 2), 0) as 'Percentage Contributors with Hashtags',
    COALESCE(ROUND((mwh.edits_with_hashtags * 100.0) / mt.total_edits, 2), 0) as 'Percentage Edits with Hashtags'
FROM monthly_total mt
LEFT JOIN monthly_with_hashtags mwh ON mt.year = mwh.year AND mt.month = mwh.month
ORDER BY mt.year, mt.month
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Percentage of Contributors Using Hashtags",
            label="Contributors",
            x_col="months",
            y_col="Percentage Contributors with Hashtags",
            y_unit_hover_template="%",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Percentage of Edits Using Hashtags",
            label="Edits",
            x_col="months",
            y_col="Percentage Edits with Hashtags",
            y_unit_hover_template="%",
            query_or_df=df,
        ),
    ]
)

Monthly Top 10 Hashtags: Edits, Accumulated Edits, Contributors and Accumulated Contributors

code
# Get top 10 hashtags by total edits
df = duckdb.sql("""
WITH hashtag_expanded AS (
    SELECT 
        year,
        month,
        user_name,
        edit_count,
        unnest(hashtags) as hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL
),
top_hashtags AS (
    SELECT hashtag
    FROM (
        SELECT
            hashtag,
            SUM(edit_count) as total_edits
        FROM hashtag_expanded
        GROUP BY hashtag
        ORDER BY total_edits DESC
        LIMIT 10
    )
),
monthly_hashtag_data AS (
    SELECT 
        he.year,
        he.month,
        CONCAT(he.year, '-', LPAD(CAST(he.month as VARCHAR), 2, '0')) as months,
        he.hashtag,
        COUNT(DISTINCT he.user_name) as "Contributors",
        SUM(he.edit_count) as "Edits"
    FROM hashtag_expanded he
    WHERE he.hashtag IN (SELECT hashtag FROM top_hashtags)
    GROUP BY he.year, he.month, he.hashtag
)
SELECT 
    months,
    hashtag,
    "Contributors",
    "Edits",
    SUM("Contributors") OVER (PARTITION BY hashtag ORDER BY year, month) as "Contributors Accumulated",
    SUM("Edits") OVER (PARTITION BY hashtag ORDER BY year, month) as "Edits Accumulated"
FROM monthly_hashtag_data
ORDER BY year, month, hashtag
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Edits by Top 10 Hashtags",
            label="Edits",
            x_col="months",
            y_col="Edits",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Edits by Top 10 Hashtags",
            label="Edits Accumulated",
            x_col="months",
            y_col="Edits Accumulated",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Contributors by Top 10 Hashtags",
            label="Contributors",
            x_col="months",
            y_col="Contributors",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Contributors by Top 10 Hashtags",
            label="Contributors Accumulated",
            x_col="months",
            y_col="Contributors Accumulated",
            group_col="hashtag",
            query_or_df=df,
        ),
    ]
)

Top 100 Hashtags Yearly

code
query = """
WITH hashtag_expanded AS (
    SELECT 
        year,
        user_name,
        edit_count,
        unnest(hashtags) as hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL
),
user_first_year AS (
    SELECT 
        user_name,
        hashtag,
        MIN(year) as first_year
    FROM hashtag_expanded
    GROUP BY user_name, hashtag
),
hashtag_totals AS (
    SELECT
        hashtag as "Hashtag",
        CAST(SUM(edit_count) as BIGINT) as total_edits_all_time,
        CAST(SUM(CASE WHEN year >= 2021 THEN edit_count ELSE 0 END) as BIGINT) as total_edits_2021_now,
        CAST(COUNT(DISTINCT user_name) as BIGINT) as total_contributors_all_time,
        CAST(COUNT(DISTINCT CASE WHEN year >= 2021 THEN user_name END) as BIGINT) as total_contributors_2021_now
    FROM hashtag_expanded
    GROUP BY hashtag
),
yearly_metrics AS (
    SELECT
        he.year,
        he.hashtag as "Hashtag",
        CAST(SUM(he.edit_count) as BIGINT) as "Edits",
        CAST(COUNT(DISTINCT he.user_name) as BIGINT) as "Contributors",
        CAST(COUNT(DISTINCT CASE WHEN ufy.first_year = he.year THEN he.user_name END) as BIGINT) as "New Contributors"
    FROM hashtag_expanded he
    LEFT JOIN user_first_year ufy 
        ON he.user_name = ufy.user_name AND he.hashtag = ufy.hashtag
    GROUP BY he.year, he.hashtag
)
SELECT 
    ym.year,
    ym."Hashtag",
    ym."Edits",
    ym."New Contributors",
    ym."Contributors",
    ht.total_edits_all_time as "Total Edits",
    ht.total_edits_2021_now as "Total Edits (2021 - Now)",
    ht.total_contributors_all_time as "Total Contributors",
    ht.total_contributors_2021_now as "Total Contributors (2021 - Now)"
FROM yearly_metrics ym
JOIN hashtag_totals ht
    ON ym."Hashtag" = ht."Hashtag"
ORDER BY year DESC, "Edits" DESC
"""
df = duckdb.sql(query).df()

top_100_contributors = df.groupby("Hashtag")["Total Contributors"].first().nlargest(100)
top_100_contributors_2021_now = df.groupby("Hashtag")["Total Contributors (2021 - Now)"].first().nlargest(100)
top_100_edits = df.groupby("Hashtag")["Total Edits"].first().nlargest(100)
top_100_edits_2021_now = df.groupby("Hashtag")["Total Edits (2021 - Now)"].first().nlargest(100)

table_configs = [
    util.TableConfig(
        title="Top 100 Hashtags by Contributors",
        query_or_df=df[df["Hashtag"].isin(top_100_contributors.index)],
        x_axis_col="year",
        y_axis_col="Hashtag",
        value_col="Contributors",
        center_columns=["Rank", "Hashtag"],
        sum_col="Total Contributors",
    ),
    util.TableConfig(
        title="Top 100 Hashtags by Contributors 2021 - Now",
        query_or_df=df[(df["Hashtag"].isin(top_100_contributors_2021_now.index)) & (df["year"] >= 2021)],
        x_axis_col="year",
        y_axis_col="Hashtag",
        value_col="Contributors",
        center_columns=["Rank", "Hashtag"],
        sum_col="Total Contributors (2021 - Now)",
    ),
    util.TableConfig(
        title="Top 100 Hashtags by Edits",
        query_or_df=df[df["Hashtag"].isin(top_100_edits.index)],
        x_axis_col="year",
        y_axis_col="Hashtag",
        value_col="Edits",
        center_columns=["Rank", "Hashtag"],
        sum_col="Total Edits",
    ),
    util.TableConfig(
        title="Top 100 Hashtags by Edits 2021 - Now",
        query_or_df=df[(df["Hashtag"].isin(top_100_edits_2021_now.index)) & (df["year"] >= 2021)],
        x_axis_col="year",
        y_axis_col="Hashtag",
        value_col="Edits",
        center_columns=["Rank", "Hashtag"],
        sum_col="Total Edits (2021 - Now)",
    ),
]

util.show_tables(table_configs)
Rank Hashtag 2017 2018 2019 2020 2021 2022 2023 2024 2025 Total Contributors
1 #missingmaps 12,561 28,330 26,727 28,770 35,232 10,913 12,791 9,616 5,948 151,269
2 #redcross 7,849 8,209 7,332 19,818 25,801 3,043 688 1,071 285 67,882
3 #msf 724 5,128 8,456 4,899 5,714 4,988 6,094 6,319 4,670 41,936
4 #opencitieslac 0 0 0 0 9,244 16,658 9,565 125 36 32,844
5 #youthmappers 1,601 2,916 4,288 5,469 9,179 4,429 2,870 2,408 1,818 31,117
6 #indonesia 1 1,622 2,031 14,365 14,434 371 51 3 3 30,599
7 #covid19 0 0 0 21,990 6,691 283 154 72 1 28,072
8 #climatechange 0 0 4 8,012 19,957 462 0 160 8 27,036
9 #accenture 548 1,870 2,085 4,814 5,614 1,730 2,065 1,792 1,378 20,118
10 #bangladesh 3,191 3,011 161 675 7,919 3,644 186 521 48 18,624
11 #jpmc 141 1,018 1,256 2,815 4,305 4,828 2,222 2,532 21 17,262
12 #tanzaniadevelopmenttrust 1,383 4,409 3,894 3,425 1,157 1,173 1,232 429 15 15,881
13 #educategirls 0 0 0 12,563 2,757 177 147 127 0 15,543
14 #amm 781 3,299 2,378 4,597 2,630 1,824 1,790 1,175 261 15,027
15 #nigeria 1,097 3,496 2,555 2,177 2,608 878 141 26 31 12,369
16 #osmph 3 621 2,994 5,091 1,368 472 487 1,392 19 11,871
17 #buildings 478 119 107 2,966 3,795 1,999 1,059 1,686 95 11,369
18 #salesforce 57 3,251 1,234 3,726 1,216 665 278 1,836 287 11,358
19 #aphub 0 0 0 0 16 0 1,575 5,630 5,059 11,349
20 #liga-zambia 0 0 0 0 11,124 111 134 0 0 11,319
21 #mapgive 3,012 1,804 708 1,178 3,488 533 197 447 2 10,916
22 #maproulette 154 473 1,426 1,779 1,564 1,676 2,769 2,997 2,637 10,425
23 #msft 1,514 2,100 1,542 2,558 2,586 35 131 661 265 10,416
24 #myanmar 3,013 4,336 2,435 418 334 27 22 2 3 10,095
25 #covbots 0 0 0 7,617 1,853 100 71 4 0 9,346
26 #dronebird 1 659 2,321 1,257 1,656 385 523 361 1,937 8,744
27 #türkiyeeq060223 0 0 0 0 0 0 8,719 3 2 8,721
28 #tanzania 1,383 2,295 740 2,004 1,066 695 595 435 33 8,445
29 #upnoah 36 820 2,991 3,463 1,100 212 0 2 0 8,330
30 #osmgeoweek 170 297 162 460 3,370 1,873 1,321 1,215 1 8,313
31 #omguru 0 0 0 0 0 0 364 6,276 2,118 8,257
32 #galschool 0 154 1,257 6,856 7 5 0 0 0 8,123
33 #msftgive 1,251 1,561 1,581 2,496 2,013 44 39 5 13 8,114
34 #hotmicrogrants2020 0 0 0 7,676 148 122 77 84 0 8,057
35 #mapph 7 595 2,884 3,425 969 191 0 3 0 7,813
36 #foodsecurityph 0 595 2,879 3,424 965 192 0 2 0 7,797
37 #furuhashilab 0 31 1,584 1,269 1,695 494 544 364 2,022 7,699
38 #yercizenler 0 554 415 946 247 51 5,379 202 59 7,498
39 #rodekruis 2,278 1,787 1,435 1,050 633 224 75 439 64 7,468
40 #missingmaps_drk 0 0 254 1,343 3,154 1,966 397 455 217 7,373
41 #awsmapathon 0 159 38 675 675 1,500 3,338 1,285 44 7,092
42 #maplibya 0 0 0 41 2,937 1,817 816 1,865 1 7,002
43 #uniquemappersnetwork 0 0 262 560 3,404 1,801 653 306 580 6,997
44 #crisismappersjapan 52 20 2,049 1,167 1,201 258 329 329 1,778 6,992
45 #lnsdi 0 0 0 0 2,927 1,816 816 1,865 1 6,957
46 #uniquemappersteam 20 232 29 465 3,381 1,862 616 304 578 6,938
47 #mapimpacto 0 0 0 6,667 45 5 0 0 0 6,713
48 #ourimpact 0 0 0 1,389 891 1,325 1,835 2,073 292 6,620
49 #tt_event 0 0 0 0 0 0 26 2,934 4,116 6,570
50 #msgivesback 0 0 0 1,106 2,133 2,275 810 944 10 6,384
51 #maptripoli 0 0 0 0 2,881 1,719 702 1,507 1 6,364
52 #nigerstate 1,094 2,171 907 1,696 347 99 1 0 0 6,076
53 #lndti 0 0 0 0 1,940 1,816 815 1,865 1 6,013
54 #cartong 137 1,565 786 1,032 1,434 676 386 267 206 5,978
55 #covjam 0 0 0 5,670 137 0 0 0 0 5,787
56 #hurricanemaria 5,763 4 0 1 0 0 0 0 0 5,764
57 #kenya 174 869 3,805 676 220 172 7 24 6 5,743
58 #osm_in 0 2,820 2,592 458 5 1 0 0 0 5,648
59 #puertorico 5,636 2 0 5 0 0 0 0 0 5,641
60 #osmnigeria 0 29 281 557 3,318 1,627 227 1 0 5,610
61 #unmappers 0 0 324 471 1,939 1,335 778 343 879 5,592
62 #water 11 11 14 15 2,018 1,832 897 1,186 15 5,576
63 #disasterresponse 0 0 1 0 309 776 583 3,186 1,140 5,517
64 #moroccoearthquake2023 0 0 0 0 0 0 4,727 1,027 4 5,510
65 #infrastructure 0 4 0 2 1,943 1,814 872 1,171 3 5,404
66 #vegetation 0 0 0 0 1,936 1,813 878 1,171 2 5,397
67 #landuselandcover 0 0 0 0 1,937 1,813 871 1,170 1 5,390
68 #streetsroads 0 0 0 0 1,936 1,812 872 1,169 1 5,388
69 #soil 0 0 0 0 1,935 1,810 872 1,169 1 5,386
70 #air 0 1 1 1 1,930 1,801 870 1,169 4 5,375
71 #osm 39 63 97 456 2,958 541 209 1,087 71 5,301
72 #omhap 0 0 0 0 0 628 4,526 37 12 5,092
73 #uniquemappersteam2021 0 0 0 0 3,404 1,616 226 1 0 4,923
74 #hotph 0 0 1,758 2,894 388 12 9 31 0 4,875
75 #turkey 0 1 9 5 3 4 4,810 13 1 4,828
76 #srilanka 3 2,474 5 12 19 46 2,308 1 3 4,828
77 #turkiye 0 0 0 0 1 0 4,813 7 0 4,817
78 #withrefugees 628 3,803 413 24 15 3 0 0 0 4,781
79 #osmindia 0 0 0 0 1 7 449 3,872 635 4,750
80 #syria 0 1 1 1 1 0 4,532 164 63 4,738
81 #philaware 0 0 1,758 2,852 274 11 8 31 0 4,729
82 #colombia 235 2,602 736 602 652 12 1 7 5 4,728
83 #victornsunday 0 0 260 556 2,623 1,346 225 1 0 4,725
84 #bloomberg 1 0 344 1,866 1,185 501 664 670 82 4,620
85 #madagascar 147 305 15 5 2,000 2,323 6 3 1 4,619
86 #chad 0 279 2,183 522 188 1,408 215 1 0 4,592
87 #drc 2 356 1,427 792 179 79 369 1,391 169 4,529
88 #rccolombia 232 2,598 523 565 650 12 0 0 0 4,480
89 #nigeriafloods2021 0 0 0 0 2,491 2,043 175 8 0 4,467
90 #osmafrica 0 0 1 0 4 2,796 894 846 120 4,403
91 #mapuganda 230 2,465 22 579 375 599 152 97 64 4,390
92 #womenconnect 0 0 3,157 1,275 6 3 0 0 0 4,294
93 #osmnepal 7 26 25 390 396 1,187 1,437 1,260 72 4,281
94 #ngdisasterriskreduction 0 0 262 527 2,510 1,004 208 1 0 4,273
95 #youthmappersagu 0 0 0 30 1,419 433 420 137 1,920 4,256
96 #gomicron21twsg 0 0 0 0 4,248 0 0 0 0 4,248
97 #cycloneidai 0 0 4,207 9 3 1 1 0 0 4,212
98 #mapforpreparedness 307 104 1,298 1,801 821 3 0 0 0 4,211
99 #osmbd 117 82 26 6 48 3,463 270 259 47 4,099
100 #mozambique 5 18 2,920 218 87 871 4 6 0 4,064

Geographical Distribution of Top 10 Hashtags

code
# Create geographical maps for top 10 hashtags
df = duckdb.sql("""
WITH hashtag_expanded AS (
    SELECT 
        mid_pos_x,
        mid_pos_y,
        edit_count,
        unnest(hashtags) as hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL 
      AND mid_pos_x IS NOT NULL 
      AND mid_pos_y IS NOT NULL
),
top_hashtags AS (
    SELECT hashtag
    FROM (
        SELECT
            hashtag,
            SUM(edit_count) as total_edits
        FROM hashtag_expanded
        GROUP BY hashtag
        ORDER BY total_edits DESC
        LIMIT 10
    )
),
hashtag_geo_data AS (
    SELECT 
        he.hashtag,
        he.mid_pos_x as x,
        he.mid_pos_y as y,
        SUM(he.edit_count) as z
    FROM hashtag_expanded he
    WHERE he.hashtag IN (SELECT hashtag FROM top_hashtags)
    GROUP BY he.hashtag, he.mid_pos_x, he.mid_pos_y
)
SELECT * FROM hashtag_geo_data
ORDER BY hashtag, x, y
""").df()

# Create dropdown maps for each hashtag
configs = []
for hashtag in sorted(df["hashtag"].unique()):
    configs.append(
        util.FigureConfig(
            title=f"Edit Distribution - {hashtag}".replace("#", ""),
            x_col="x",
            y_col="y",
            z_col="z",
            query_or_df=df[df["hashtag"] == hashtag],
            plot_type="map",
        )
    )

util.show_figure(configs, type="dropdown")