Hashtags Analysis

Analysis of OpenStreetMap changesets that use hashtags from the hashtags column.

Monthly Percentage of Edits and Contributors Using Hashtags

code
df = duckdb.sql("""
WITH monthly_with_hashtags AS (
    SELECT 
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        COUNT(DISTINCT user_name) as contributors_with_hashtags,
        SUM(edit_count) as edits_with_hashtags
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL
    GROUP BY year, month
),
monthly_total AS (
    SELECT 
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        COUNT(DISTINCT user_name) as total_contributors,
        SUM(edit_count) as total_edits
    FROM '../changeset_data/year=*/month=*/*.parquet'
    GROUP BY year, month
)
SELECT 
    mt.months,
    COALESCE(ROUND((mwh.contributors_with_hashtags * 100.0) / mt.total_contributors, 2), 0) as 'Percentage Contributors with Hashtags',
    COALESCE(ROUND((mwh.edits_with_hashtags * 100.0) / mt.total_edits, 2), 0) as 'Percentage Edits with Hashtags'
FROM monthly_total mt
LEFT JOIN monthly_with_hashtags mwh ON mt.year = mwh.year AND mt.month = mwh.month
ORDER BY mt.year, mt.month
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Percentage of Contributors Using Hashtags",
            label="Contributors",
            x_col="months",
            y_col="Percentage Contributors with Hashtags",
            y_unit_hover_template="%",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Percentage of Edits Using Hashtags",
            label="Edits",
            x_col="months",
            y_col="Percentage Edits with Hashtags",
            y_unit_hover_template="%",
            query_or_df=df,
        ),
    ]
)

Monthly Top 10 Hashtags: Edits, Accumulated Edits, Contributors and Accumulated Contributors

code
# Get top 10 hashtags by total edits
df = duckdb.sql("""
WITH hashtag_expanded AS (
    SELECT 
        year,
        month,
        user_name,
        edit_count,
        unnest(hashtags) as hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL
),
top_hashtags AS (
    SELECT hashtag
    FROM (
        SELECT
            hashtag,
            SUM(edit_count) as total_edits
        FROM hashtag_expanded
        GROUP BY hashtag
        ORDER BY total_edits DESC
        LIMIT 10
    )
),
monthly_hashtag_data AS (
    SELECT 
        he.year,
        he.month,
        CONCAT(he.year, '-', LPAD(CAST(he.month as VARCHAR), 2, '0')) as months,
        he.hashtag,
        COUNT(DISTINCT he.user_name) as "Contributors",
        SUM(he.edit_count) as "Edits"
    FROM hashtag_expanded he
    WHERE he.hashtag IN (SELECT hashtag FROM top_hashtags)
    GROUP BY he.year, he.month, he.hashtag
)
SELECT 
    months,
    hashtag,
    "Contributors",
    "Edits",
    SUM("Contributors") OVER (PARTITION BY hashtag ORDER BY year, month) as "Contributors Accumulated",
    SUM("Edits") OVER (PARTITION BY hashtag ORDER BY year, month) as "Edits Accumulated"
FROM monthly_hashtag_data
ORDER BY year, month, hashtag
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Edits by Top 10 Hashtags",
            label="Edits",
            x_col="months",
            y_col="Edits",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Edits by Top 10 Hashtags",
            label="Edits Accumulated",
            x_col="months",
            y_col="Edits Accumulated",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Contributors by Top 10 Hashtags",
            label="Contributors",
            x_col="months",
            y_col="Contributors",
            group_col="hashtag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Contributors by Top 10 Hashtags",
            label="Contributors Accumulated",
            x_col="months",
            y_col="Contributors Accumulated",
            group_col="hashtag",
            query_or_df=df,
        ),
    ]
)

Top 100 Hashtags Yearly

code
query = """
WITH hashtag_expanded AS (
    SELECT 
        year,
        user_name,
        edit_count,
        unnest(hashtags) as hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL
),
user_first_year AS (
    SELECT 
        user_name,
        hashtag,
        MIN(year) as first_year
    FROM hashtag_expanded
    GROUP BY user_name, hashtag
),
hashtag_totals AS (
    SELECT
        hashtag as "Hashtag",
        CAST(SUM(edit_count) as BIGINT) as total_edits_all_time,
        CAST(SUM(CASE WHEN year >= 2021 THEN edit_count ELSE 0 END) as BIGINT) as total_edits_2021_now,
        CAST(COUNT(DISTINCT user_name) as BIGINT) as total_contributors_all_time,
        CAST(COUNT(DISTINCT CASE WHEN year >= 2021 THEN user_name END) as BIGINT) as total_contributors_2021_now
    FROM hashtag_expanded
    GROUP BY hashtag
),
yearly_metrics AS (
    SELECT
        he.year,
        he.hashtag as "Hashtag",
        CAST(SUM(he.edit_count) as BIGINT) as "Edits",
        CAST(COUNT(DISTINCT he.user_name) as BIGINT) as "Contributors",
        CAST(COUNT(DISTINCT CASE WHEN ufy.first_year = he.year THEN he.user_name END) as BIGINT) as "New Contributors"
    FROM hashtag_expanded he
    LEFT JOIN user_first_year ufy 
        ON he.user_name = ufy.user_name AND he.hashtag = ufy.hashtag
    GROUP BY he.year, he.hashtag
)
SELECT 
    ym.year,
    ym."Hashtag",
    ym."Edits",
    ym."New Contributors",
    ym."Contributors",
    ht.total_edits_all_time as "Total Edits",
    ht.total_edits_2021_now as "Total Edits (2021 - Now)",
    ht.total_contributors_all_time as "Total Contributors",
    ht.total_contributors_2021_now as "Total Contributors (2021 - Now)"
FROM yearly_metrics ym
JOIN hashtag_totals ht
    ON ym."Hashtag" = ht."Hashtag"
ORDER BY year DESC, "Edits" DESC
"""
df = duckdb.sql(query).df()

top_100_contributors = df.groupby("Hashtag")["Total Contributors"].first().nlargest(100)
top_100_contributors_2021_now = df.groupby("Hashtag")["Total Contributors (2021 - Now)"].first().nlargest(100)
top_100_edits = df.groupby("Hashtag")["Total Edits"].first().nlargest(100)
top_100_edits_2021_now = df.groupby("Hashtag")["Total Edits (2021 - Now)"].first().nlargest(100)

table_configs = [
    util.TableConfig(
        title="Top 100 Hashtags by Contributors",
        query_or_df=df[df["Hashtag"].isin(top_100_contributors.index)],
        x_axis_col="year",
        y_axis_col="Hashtag",
        value_col="Contributors",
        center_columns=["Rank", "Hashtag"],
        sum_col="Total Contributors",
    ),
    util.TableConfig(
        title="Top 100 Hashtags by Contributors 2021 - Now",
        query_or_df=df[(df["Hashtag"].isin(top_100_contributors_2021_now.index)) & (df["year"] >= 2021)],
        x_axis_col="year",
        y_axis_col="Hashtag",
        value_col="Contributors",
        center_columns=["Rank", "Hashtag"],
        sum_col="Total Contributors (2021 - Now)",
    ),
    util.TableConfig(
        title="Top 100 Hashtags by Edits",
        query_or_df=df[df["Hashtag"].isin(top_100_edits.index)],
        x_axis_col="year",
        y_axis_col="Hashtag",
        value_col="Edits",
        center_columns=["Rank", "Hashtag"],
        sum_col="Total Edits",
    ),
    util.TableConfig(
        title="Top 100 Hashtags by Edits 2021 - Now",
        query_or_df=df[(df["Hashtag"].isin(top_100_edits_2021_now.index)) & (df["year"] >= 2021)],
        x_axis_col="year",
        y_axis_col="Hashtag",
        value_col="Edits",
        center_columns=["Rank", "Hashtag"],
        sum_col="Total Edits (2021 - Now)",
    ),
]

util.show_tables(table_configs)
Rank Hashtag 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 Total Contributors
1 #missingmaps 12,561 28,330 26,727 28,770 35,232 10,913 12,791 9,616 7,798 379 153,041
2 #redcross 7,849 8,209 7,332 19,818 25,801 3,043 688 1,071 291 5 67,885
3 #msf 724 5,128 8,456 4,899 5,714 4,988 6,094 6,319 6,301 292 43,529
4 #opencitieslac 0 0 0 0 9,244 16,658 9,565 125 37 2 32,846
5 #youthmappers 1,601 2,916 4,288 5,469 9,179 4,429 2,870 2,408 2,013 162 31,388
6 #indonesia 1 1,622 2,031 14,365 14,434 371 51 3 5 0 30,601
7 #covid19 0 0 0 21,990 6,691 283 154 72 1 0 28,072
8 #climatechange 0 0 4 8,012 19,957 462 0 160 9 0 27,037
9 #accenture 548 1,870 2,085 4,814 5,614 1,730 2,065 1,792 1,431 2 20,164
10 #bangladesh 3,191 3,011 161 675 7,919 3,644 186 521 50 5 18,628
11 #jpmc 141 1,018 1,256 2,815 4,305 4,828 2,222 2,532 26 5 17,268
12 #tanzaniadevelopmenttrust 1,383 4,409 3,894 3,425 1,157 1,173 1,232 429 15 0 15,881
13 #educategirls 0 0 0 12,563 2,757 177 147 127 0 0 15,543
14 #amm 781 3,299 2,378 4,597 2,630 1,824 1,790 1,175 269 1 15,030
15 #nigeria 1,097 3,496 2,555 2,177 2,608 878 141 26 36 31 12,395
16 #osmph 3 621 2,994 5,091 1,368 472 487 1,392 21 1 11,873
17 #aphub 0 0 0 0 16 0 1,575 5,630 5,151 37 11,442
18 #buildings 478 119 107 2,966 3,795 1,999 1,059 1,686 103 9 11,381
19 #salesforce 57 3,251 1,234 3,726 1,216 665 278 1,836 295 5 11,362
20 #liga-zambia 0 0 0 0 11,124 111 134 0 0 0 11,319
21 #mapgive 3,012 1,804 708 1,178 3,488 533 197 447 2 0 10,916
22 #maproulette 154 473 1,426 1,779 1,564 1,676 2,769 2,997 3,001 539 10,879
23 #msft 1,514 2,100 1,542 2,558 2,586 35 131 661 265 0 10,416
24 #myanmar 3,013 4,336 2,435 418 334 27 22 2 3 1 10,096
25 #covbots 0 0 0 7,617 1,853 100 71 4 0 0 9,346
26 #dronebird 1 659 2,321 1,257 1,656 385 523 361 2,486 58 9,318
27 #türkiyeeq060223 0 0 0 0 0 0 8,719 3 2 0 8,721
28 #tanzania 1,383 2,295 740 2,004 1,066 695 595 435 34 0 8,446
29 #upnoah 36 820 2,991 3,463 1,100 212 0 2 0 0 8,330
30 #omguru 0 0 0 0 0 0 364 6,276 2,173 46 8,318
31 #osmgeoweek 170 297 162 460 3,370 1,873 1,321 1,215 2 0 8,314
32 #furuhashilab 0 31 1,584 1,269 1,695 494 544 364 2,588 67 8,292
33 #galschool 0 154 1,257 6,856 7 5 0 0 0 0 8,123
34 #msftgive 1,251 1,561 1,581 2,496 2,013 44 39 5 13 0 8,114
35 #hotmicrogrants2020 0 0 0 7,676 148 122 77 84 0 0 8,057
36 #mapph 7 595 2,884 3,425 969 191 0 3 0 0 7,813
37 #foodsecurityph 0 595 2,879 3,424 965 192 0 2 0 0 7,797
38 #crisismappersjapan 52 20 2,049 1,167 1,201 258 329 329 2,328 52 7,562
39 #yercizenler 0 554 415 946 247 51 5,379 202 60 0 7,499
40 #rodekruis 2,278 1,787 1,435 1,050 633 224 75 439 67 4 7,471
41 #missingmaps_drk 0 0 254 1,343 3,154 1,966 397 455 218 1 7,373
42 #tt_event 0 0 0 0 0 0 26 2,934 4,754 295 7,313
43 #uniquemappersnetwork 0 0 262 560 3,404 1,801 653 306 693 6 7,112
44 #awsmapathon 0 159 38 675 675 1,500 3,338 1,285 57 4 7,105
45 #uniquemappersteam 20 232 29 465 3,381 1,862 616 304 691 5 7,053
46 #maplibya 0 0 0 41 2,937 1,817 816 1,865 1 0 7,002
47 #lnsdi 0 0 0 0 2,927 1,816 816 1,865 1 0 6,957
48 #mapimpacto 0 0 0 6,667 45 5 0 0 0 0 6,713
49 #ourimpact 0 0 0 1,389 891 1,325 1,835 2,073 294 5 6,621
50 #msgivesback 0 0 0 1,106 2,133 2,275 810 944 13 4 6,388
51 #maptripoli 0 0 0 0 2,881 1,719 702 1,507 1 0 6,364
52 #nigerstate 1,094 2,171 907 1,696 347 99 1 0 0 0 6,076
53 #cartong 137 1,565 786 1,032 1,434 676 386 267 256 19 6,022
54 #lndti 0 0 0 0 1,940 1,816 815 1,865 1 0 6,013
55 #unmappers 0 0 324 471 1,939 1,335 778 343 1,022 111 5,805
56 #covjam 0 0 0 5,670 137 0 0 0 0 0 5,787
57 #hurricanemaria 5,763 4 0 1 0 0 0 0 0 0 5,764
58 #kenya 174 869 3,805 676 220 172 7 24 6 0 5,743
59 #osm_in 0 2,820 2,592 458 5 1 0 0 0 0 5,648
60 #puertorico 5,636 2 0 5 0 0 0 0 0 0 5,641
61 #osmnigeria 0 29 281 557 3,318 1,627 227 1 0 0 5,610
62 #water 11 11 14 15 2,018 1,832 897 1,186 17 2 5,579
63 #disasterresponse 0 0 1 0 309 776 583 3,186 1,188 1 5,553
64 #moroccoearthquake2023 0 0 0 0 0 0 4,727 1,027 4 0 5,510
65 #infrastructure 0 4 0 2 1,943 1,814 872 1,171 4 0 5,404
66 #vegetation 0 0 0 0 1,936 1,813 878 1,171 2 0 5,397
67 #landuselandcover 0 0 0 0 1,937 1,813 871 1,170 1 0 5,390
68 #streetsroads 0 0 0 0 1,936 1,812 872 1,169 1 0 5,388
69 #soil 0 0 0 0 1,935 1,810 872 1,169 1 0 5,386
70 #air 0 1 1 1 1,930 1,801 870 1,169 5 1 5,376
71 #osm 39 63 97 456 2,958 541 209 1,087 89 4 5,321
72 #omhap 0 0 0 0 0 628 4,526 37 13 0 5,092
73 #osmindia 0 0 0 0 1 7 449 3,872 819 27 4,938
74 #uniquemappersteam2021 0 0 0 0 3,404 1,616 226 1 0 0 4,923
75 #hotph 0 0 1,758 2,894 388 12 9 31 0 0 4,875
76 #youthmappersagu 0 0 0 30 1,419 433 420 137 2,471 47 4,829
77 #srilanka 3 2,474 5 12 19 46 2,308 1 4 0 4,829
78 #turkey 0 1 9 5 3 4 4,810 13 1 0 4,828
79 #turkiye 0 0 0 0 1 0 4,813 7 0 0 4,817
80 #withrefugees 628 3,803 413 24 15 3 0 0 0 0 4,781
81 #syria 0 1 1 1 1 0 4,532 164 65 0 4,740
82 #philaware 0 0 1,758 2,852 274 11 8 31 0 0 4,729
83 #colombia 235 2,602 736 602 652 12 1 7 5 0 4,728
84 #victornsunday 0 0 260 556 2,623 1,346 225 1 0 0 4,725
85 #bloomberg 1 0 344 1,866 1,185 501 664 670 85 5 4,622
86 #madagascar 147 305 15 5 2,000 2,323 6 3 2 1 4,619
87 #chad 0 279 2,183 522 188 1,408 215 1 0 0 4,592
88 #drc 2 356 1,427 792 179 79 369 1,391 169 0 4,529
89 #rccolombia 232 2,598 523 565 650 12 0 0 0 0 4,480
90 #nigeriafloods2021 0 0 0 0 2,491 2,043 175 8 0 0 4,467
91 #osmafrica 0 0 1 0 4 2,796 894 846 125 9 4,411
92 #mapuganda 230 2,465 22 579 375 599 152 97 67 4 4,394
93 #osmnepal 7 26 25 390 396 1,187 1,437 1,260 74 79 4,357
94 #womenconnect 0 0 3,157 1,275 6 3 0 0 0 0 4,294
95 #ngdisasterriskreduction 0 0 262 527 2,510 1,004 208 1 0 0 4,273
96 #gomicron21twsg 0 0 0 0 4,248 0 0 0 0 0 4,248
97 #cycloneidai 0 0 4,207 9 3 1 1 0 0 0 4,212
98 #mapforpreparedness 307 104 1,298 1,801 821 3 0 0 0 0 4,211
99 #osmbd 117 82 26 6 48 3,463 270 259 49 1 4,102
100 #mozambique 5 18 2,920 218 87 871 4 6 0 0 4,064

Geographical Distribution of Top 10 Hashtags

code
# Create geographical maps for top 10 hashtags
df = duckdb.sql("""
WITH hashtag_expanded AS (
    SELECT 
        mid_pos_x,
        mid_pos_y,
        edit_count,
        unnest(hashtags) as hashtag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE hashtags IS NOT NULL 
      AND mid_pos_x IS NOT NULL 
      AND mid_pos_y IS NOT NULL
),
top_hashtags AS (
    SELECT hashtag
    FROM (
        SELECT
            hashtag,
            SUM(edit_count) as total_edits
        FROM hashtag_expanded
        GROUP BY hashtag
        ORDER BY total_edits DESC
        LIMIT 10
    )
),
hashtag_geo_data AS (
    SELECT 
        he.hashtag,
        he.mid_pos_x as x,
        he.mid_pos_y as y,
        SUM(he.edit_count) as z
    FROM hashtag_expanded he
    WHERE he.hashtag IN (SELECT hashtag FROM top_hashtags)
    GROUP BY he.hashtag, he.mid_pos_x, he.mid_pos_y
)
SELECT * FROM hashtag_geo_data
ORDER BY hashtag, x, y
""").df()

# Create dropdown maps for each hashtag
configs = []
for hashtag in sorted(df["hashtag"].unique()):
    configs.append(
        util.FigureConfig(
            title=f"Edit Distribution - {hashtag}".replace("#", ""),
            x_col="x",
            y_col="y",
            z_col="z",
            query_or_df=df[df["hashtag"] == hashtag],
            plot_type="map",
        )
    )

util.show_figure(configs, type="dropdown")