Source Tag Analysis

Analysis of OpenStreetMap changesets that use source tags to indicate data sources.

Monthly Percentage of Edits and Contributors Using Source Tags

code
df = duckdb.sql("""
WITH monthly_with_source AS (
    SELECT 
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        COUNT(DISTINCT user_name) as contributors_with_source,
        SUM(edit_count) as edits_with_source
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE source IS NOT NULL
    GROUP BY year, month
),
monthly_total AS (
    SELECT 
        year,
        month,
        CONCAT(year, '-', LPAD(CAST(month as VARCHAR), 2, '0')) as months,
        COUNT(DISTINCT user_name) as total_contributors,
        SUM(edit_count) as total_edits
    FROM '../changeset_data/year=*/month=*/*.parquet'
    GROUP BY year, month
)
SELECT 
    mt.months,
    COALESCE(ROUND((mws.contributors_with_source * 100.0) / mt.total_contributors, 2), 0) as 'Percentage Contributors with Source',
    COALESCE(ROUND((mws.edits_with_source * 100.0) / mt.total_edits, 2), 0) as 'Percentage Edits with Source'
FROM monthly_total mt
LEFT JOIN monthly_with_source mws ON mt.year = mws.year AND mt.month = mws.month
ORDER BY mt.year, mt.month
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Percentage of Contributors Using Source Tags",
            label="Contributors",
            x_col="months",
            y_col="Percentage Contributors with Source",
            y_unit_hover_template="%",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Percentage of Edits Using Source Tags",
            label="Edits",
            x_col="months",
            y_col="Percentage Edits with Source",
            y_unit_hover_template="%",
            query_or_df=df,
        ),
    ]
)

Monthly Top 10 Sources: Edits, Accumulated Edits, Contributors and Accumulated Contributors

code
# Get top 10 sources by total edits
df = duckdb.sql("""
WITH source_expanded AS (
    SELECT 
        year,
        month,
        user_name,
        edit_count,
        unnest(source) as source_tag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE source IS NOT NULL
),
top_sources AS (
    SELECT source_tag
    FROM (
        SELECT
            source_tag,
            SUM(edit_count) as total_edits
        FROM source_expanded
        GROUP BY source_tag
        ORDER BY total_edits DESC
        LIMIT 10
    )
),
monthly_source_data AS (
    SELECT 
        se.year,
        se.month,
        CONCAT(se.year, '-', LPAD(CAST(se.month as VARCHAR), 2, '0')) as months,
        se.source_tag,
        COUNT(DISTINCT se.user_name) as "Contributors",
        SUM(se.edit_count) as "Edits"
    FROM source_expanded se
    WHERE se.source_tag IN (SELECT source_tag FROM top_sources)
    GROUP BY se.year, se.month, se.source_tag
)
SELECT 
    months,
    source_tag,
    "Contributors",
    "Edits",
    SUM("Contributors") OVER (PARTITION BY source_tag ORDER BY year, month) as "Contributors Accumulated",
    SUM("Edits") OVER (PARTITION BY source_tag ORDER BY year, month) as "Edits Accumulated"
FROM monthly_source_data
ORDER BY year, month, source_tag
""").df()

util.show_figure(
    [
        util.FigureConfig(
            title="Monthly Edits by Top 10 Source Tags",
            label="Edits",
            x_col="months",
            y_col="Edits",
            group_col="source_tag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Edits by Top 10 Source Tags",
            label="Edits Accumulated",
            x_col="months",
            y_col="Edits Accumulated",
            group_col="source_tag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Monthly Contributors by Top 10 Source Tags",
            label="Contributors",
            x_col="months",
            y_col="Contributors",
            group_col="source_tag",
            query_or_df=df,
        ),
        util.FigureConfig(
            title="Accumulated Contributors by Top 10 Source Tags",
            label="Contributors Accumulated",
            x_col="months",
            y_col="Contributors Accumulated",
            group_col="source_tag",
            query_or_df=df,
        ),
    ]
)

Top 100 Sources Yearly

code
import json

# Load replacement rules for clickable links
with open("../config/replace_rules_imagery_and_source.json") as f:
    source_name_to_html_link = {
        name: f'<a href="{item["link"]}">{name}</a>' for name, item in json.load(f).items() if "link" in item
    }

query = """
WITH source_expanded AS (
    SELECT 
        year,
        user_name,
        edit_count,
        unnest(source) as source_tag
    FROM '../changeset_data/year=*/month=*/*.parquet'
    WHERE source IS NOT NULL
),
user_first_year AS (
    SELECT 
        user_name,
        source_tag,
        MIN(year) as first_year
    FROM source_expanded
    GROUP BY user_name, source_tag
),
source_totals AS (
    SELECT
        source_tag as "Source",
        CAST(SUM(edit_count) as BIGINT) as total_edits_all_time,
        CAST(SUM(CASE WHEN year >= 2021 THEN edit_count ELSE 0 END) as BIGINT) as total_edits_2021_now,
        CAST(COUNT(DISTINCT user_name) as BIGINT) as total_contributors_all_time,
        CAST(COUNT(DISTINCT CASE WHEN year >= 2021 THEN user_name END) as BIGINT) as total_contributors_2021_now
    FROM source_expanded
    GROUP BY source_tag
),
yearly_metrics AS (
    SELECT
        se.year,
        se.source_tag as "Source",
        CAST(SUM(se.edit_count) as BIGINT) as "Edits",
        CAST(COUNT(DISTINCT se.user_name) as BIGINT) as "Contributors",
        CAST(COUNT(DISTINCT CASE WHEN ufy.first_year = se.year THEN se.user_name END) as BIGINT) as "New Contributors"
    FROM source_expanded se
    LEFT JOIN user_first_year ufy 
        ON se.user_name = ufy.user_name AND se.source_tag = ufy.source_tag
    GROUP BY se.year, se.source_tag
)
SELECT 
    ym.year,
    ym."Source",
    ym."Edits",
    ym."New Contributors",
    ym."Contributors",
    st.total_edits_all_time as "Total Edits",
    st.total_edits_2021_now as "Total Edits (2021 - Now)",
    st.total_contributors_all_time as "Total Contributors",
    st.total_contributors_2021_now as "Total Contributors (2021 - Now)"
FROM yearly_metrics ym
JOIN source_totals st
    ON ym."Source" = st."Source"
ORDER BY year DESC, "Edits" DESC
"""
df = duckdb.sql(query).df()

# Apply HTML links to source names
df["Source"] = df["Source"].apply(
    lambda name: source_name_to_html_link[name] if name in source_name_to_html_link else name
)

top_100_contributors = df.groupby("Source")["Total Contributors"].first().nlargest(100)
top_100_contributors_2021_now = df.groupby("Source")["Total Contributors (2021 - Now)"].first().nlargest(100)
top_100_edits = df.groupby("Source")["Total Edits"].first().nlargest(100)
top_100_edits_2021_now = df.groupby("Source")["Total Edits (2021 - Now)"].first().nlargest(100)

table_configs = [
    util.TableConfig(
        title="Top 100 Sources by Contributors",
        query_or_df=df[df["Source"].isin(top_100_contributors.index)],
        x_axis_col="year",
        y_axis_col="Source",
        value_col="Contributors",
        center_columns=["Rank", "Source"],
        sum_col="Total Contributors",
    ),
    util.TableConfig(
        title="Top 100 Sources by Contributors 2021 - Now",
        query_or_df=df[(df["Source"].isin(top_100_contributors_2021_now.index)) & (df["year"] >= 2021)],
        x_axis_col="year",
        y_axis_col="Source",
        value_col="Contributors",
        center_columns=["Rank", "Source"],
        sum_col="Total Contributors (2021 - Now)",
    ),
    util.TableConfig(
        title="Top 100 Sources by Edits",
        query_or_df=df[df["Source"].isin(top_100_edits.index)],
        x_axis_col="year",
        y_axis_col="Source",
        value_col="Edits",
        center_columns=["Rank", "Source"],
        sum_col="Total Edits",
    ),
    util.TableConfig(
        title="Top 100 Sources by Edits 2021 - Now",
        query_or_df=df[(df["Source"].isin(top_100_edits_2021_now.index)) & (df["year"] >= 2021)],
        x_axis_col="year",
        y_axis_col="Source",
        value_col="Edits",
        center_columns=["Rank", "Source"],
        sum_col="Total Edits (2021 - Now)",
    ),
]

util.show_tables(table_configs)
Rank Source 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 Total Contributors
1 Local Knowledge 15 39 96 124 299 1,112 1,185 1,155 6,004 13,364 14,866 16,846 16,774 16,676 24,925 73,065 84,639 11,213 210,691
2 Survey 50 151 266 332 936 6,016 6,994 6,815 16,397 17,965 18,067 21,616 31,851 35,699 43,215 64,838 74,319 14,528 191,520
3 Bing Aerial Imagery 0 250 543 628 1,521 7,987 9,704 9,369 11,150 13,935 14,177 14,847 14,617 14,549 17,520 33,157 35,519 6,544 120,186
4 GPS 24 73 142 158 379 2,086 1,968 1,784 2,756 4,071 4,474 5,516 5,437 4,636 5,351 11,780 13,978 1,633 46,253
5 Streetlevel Imagery 0 0 0 0 0 0 0 0 552 1,525 3,804 5,611 5,473 4,915 6,128 12,539 15,260 2,387 44,593
6 Knowledge 5 36 63 95 150 5,231 5,692 5,528 4,952 4,737 4,846 4,643 4,101 4,110 3,955 3,988 4,041 711 29,718
7 Esri World Imagery 0 0 0 0 0 0 8 3 884 2,528 3,875 5,311 5,061 4,733 5,468 5,750 6,066 1,576 20,463
8 Maxar Imagery 0 0 0 0 0 0 0 0 0 0 4,099 5,927 6,691 6,777 4,527 886 528 28 16,717
9 Mapillary 0 0 0 0 0 76 294 452 733 832 2,899 4,476 4,421 3,594 3,428 3,404 3,756 968 16,388
10 Mapbox Satellite 0 0 0 0 5 589 1,309 1,513 1,669 1,651 1,943 2,692 2,740 2,399 2,376 2,231 2,603 783 13,540
11 OSM Notes 0 0 0 0 11 41 28 41 38 42 35 32 32 1,450 2,055 4,810 5,816 778 12,863
12 mapwithai 0 0 0 0 0 0 0 0 0 0 17 220 1,310 1,773 6,674 2,268 2,093 329 11,228
13 streetside 0 0 0 0 0 0 0 0 0 2 1,450 2,261 2,121 1,425 1,646 1,639 2,020 494 8,724
14 DigitalGlobe Imagery (now Maxar) 0 4 2 11 61 219 189 164 2,250 3,276 3,193 827 604 30 33 41 19 3 7,019
15 tms[]:https:// 0 0 0 0 0 36 635 1,236 2,079 1,365 1,224 1,408 979 574 271 170 65 45 6,790
16 OSM Note 0 1 1 3 44 307 318 342 368 438 473 1,394 1,755 1,799 1,753 1,586 1,586 417 5,472
17 OpenStreetMap Carto (Standard) 0 0 0 0 0 0 0 14 129 184 331 952 1,151 1,336 1,408 1,279 1,284 222 5,279
18 OpenStreetCam Images 0 0 0 0 0 0 0 7 101 135 1,470 2,239 2,011 301 9 1 3 0 4,574
19 Strava 0 0 0 0 0 78 198 298 335 342 311 957 1,230 945 786 815 749 182 3,544
20 JOSM Validator 1 2 1 1 3 17 20 47 46 45 47 3,175 417 166 86 43 23 7 3,500
21 website 0 2 6 8 28 190 228 242 312 468 534 596 619 648 600 728 865 157 3,315
22 kartaview 0 0 0 0 0 0 0 0 0 0 0 1 50 911 1,022 939 1,031 171 2,990
23 Osmose 0 0 0 2 9 230 310 347 381 376 484 602 546 464 489 471 478 110 2,662
24 wiki.openstreetmap.org 2 3 5 6 22 135 124 135 153 212 280 307 358 371 315 398 453 79 2,060
25 Wikipedia 0 4 6 16 34 215 177 211 252 207 230 248 244 263 216 252 282 45 1,897
26 BDOrtho 0 0 0 0 0 0 0 127 253 300 332 521 534 584 541 534 498 131 1,829
27 wikipedia 3 5 14 16 34 233 199 199 212 209 227 250 225 213 178 226 206 27 1,662
28 Website 0 0 0 1 4 53 42 64 98 102 121 153 189 255 290 463 454 92 1,588
29 Geotagged Images 0 0 0 0 2 33 20 33 37 45 58 317 425 476 487 468 464 83 1,570
30 mapilio 0 0 0 0 0 0 0 0 0 0 0 0 0 0 308 694 837 139 1,562
31 Cadastre 0 1 11 13 32 183 198 205 227 216 243 319 333 377 374 393 354 72 1,540
32 panoramax 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 413 1,148 240 1,448
33 aerial_imagery 0 0 0 0 1 0 0 1 1,221 8 11 28 20 13 13 25 66 7 1,347
34 Geoportal 2: Orthophotomap 0 0 0 0 1 96 166 144 176 144 189 288 365 359 368 354 351 140 1,249
35 KartaView 0 0 0 0 0 0 0 0 0 0 0 1 532 514 352 270 280 64 1,210
36 cadastre 0 3 9 13 37 212 189 199 213 222 230 246 227 207 192 192 191 36 1,179
37 basemap.at 0 0 0 0 26 121 149 192 163 164 151 208 246 362 296 255 216 59 1,137
38 gpx 1 7 8 15 16 131 157 168 169 153 164 188 152 139 129 148 141 24 1,114
39 osmose 0 0 1 2 11 98 139 115 124 132 177 196 179 173 159 140 136 22 1,045
40 .gpx data file 0 2 3 4 14 101 106 101 105 141 133 169 143 120 114 98 102 11 1,008
41 OSM 0 0 1 2 14 95 105 101 130 112 147 143 123 110 110 100 123 21 892
42 GPX 1 2 4 13 13 97 117 118 112 112 116 152 122 91 100 97 106 11 889
43 microsoft/BuildingFootprints 0 0 0 0 0 0 0 0 0 0 0 106 88 204 322 288 208 40 868
44 PNOA Spain 0 0 1 0 1 13 22 19 32 66 96 153 215 229 250 229 234 68 863
45 wiki 0 0 1 2 3 80 87 80 102 93 109 122 122 137 107 125 140 19 794
46 youtube 0 0 0 0 0 6 13 25 31 39 70 110 146 138 172 213 238 39 791
47 Google Maps 0 0 0 0 3 17 9 9 15 43 45 58 96 63 84 184 182 3 771
48 Validator 0 0 0 2 1 33 14 8 11 10 7 664 38 20 14 9 6 3 765
49 Japan GSI 0 0 0 0 0 0 3 16 75 142 168 147 149 147 154 180 183 62 748
50 Bavaria (80 cm) 0 0 0 0 0 0 6 11 25 66 116 214 273 266 263 164 2 1 701
51 wikidata 0 0 0 0 0 18 18 41 51 54 80 95 107 141 128 152 169 23 676
52 PDOK aerial imagery 0 0 0 0 0 0 0 0 33 71 84 137 158 213 204 197 207 42 675
53 MapBox 0 0 0 0 0 345 253 159 103 47 44 18 13 15 15 12 9 4 672
54 North Rhine-Westphalia aerial photograph 0 0 0 0 0 0 0 0 0 0 0 133 226 251 233 229 232 53 663
55 local survey 0 0 4 4 8 59 66 71 83 91 118 121 125 133 97 97 108 22 638
56 White) 0 0 0 0 0 12 15 24 21 23 56 257 261 38 0 0 0 0 631
57 OSM Data 0 0 0 0 0 4 5 5 8 12 17 161 307 248 121 54 43 12 604
58 OpenStreetMap (Standard Black 0 0 0 0 0 0 0 8 20 23 56 257 263 37 0 0 0 0 592
59 Wikidata 0 0 0 0 0 16 11 56 42 47 60 76 89 114 100 110 145 36 586
60 MapBox Satellite 0 0 0 0 0 413 176 54 31 16 18 8 5 5 6 3 3 0 575
61 extrapolation 3 11 9 15 18 49 41 36 46 82 72 74 71 81 73 109 109 20 552
62 Ortskenntnis 0 0 0 0 21 118 93 91 89 111 108 101 111 80 89 67 72 15 551
63 websites 0 0 3 2 7 47 43 44 54 58 82 104 100 78 86 105 79 11 531
64 Streetside 0 0 0 0 0 0 0 0 1 3 8 35 24 21 38 258 270 59 526
65 OSM Inspector 0 0 0 2 8 79 55 82 113 78 78 80 89 75 52 61 46 16 518
66 osm 0 0 0 1 7 63 61 61 69 77 92 77 79 61 63 49 55 9 516
67 www.openstreetmap.org/changeset/ 0 0 0 0 0 1 20 33 37 64 87 98 121 128 124 133 140 26 515
68 PNOA 1 3 13 9 10 76 66 95 91 96 123 115 97 91 92 78 67 23 515
69 local 0 0 1 1 11 34 45 44 36 63 73 65 80 80 44 44 60 7 508
70 web 0 0 1 1 6 56 51 47 50 75 63 81 68 75 77 89 93 15 504
71 google maps 0 0 1 0 1 6 6 6 23 40 39 59 65 31 57 91 88 2 502
72 personal knowledge 0 2 4 2 18 67 58 62 50 60 60 63 71 46 44 61 68 5 496
73 Internet 0 0 0 0 2 57 47 50 41 57 54 56 72 76 65 92 96 14 490
74 swisstopo SWISSIMAGE 0 0 0 0 0 0 0 0 0 0 0 0 155 184 186 186 194 49 483
75 vor Ort 0 0 0 0 11 50 51 52 67 71 95 90 97 97 71 67 71 19 475
76 BANO 0 0 0 0 0 56 64 67 75 58 61 96 121 141 157 108 80 8 473
77 cadastre-dgi-fr 4 27 27 33 41 112 100 82 75 75 63 80 64 64 47 44 36 9 471
78 National Agriculture Imagery Program 0 0 0 0 0 0 0 0 0 0 21 85 106 98 110 153 122 12 470
79 observation 0 2 6 3 15 80 60 74 61 47 58 60 65 66 52 41 42 7 455
80 Norway Orthophoto 0 0 0 0 0 0 0 0 0 40 60 82 95 155 112 115 112 25 423
81 Catastro Spain 0 0 0 1 0 7 24 25 28 56 72 90 75 69 82 98 95 25 417
82 IGN 0 0 0 0 0 4 7 23 41 48 55 58 78 70 76 87 81 12 417
83 internet 0 0 0 2 1 39 33 40 38 47 68 65 51 60 51 73 61 14 414
84 Google 0 3 0 0 1 19 9 24 19 27 18 29 61 36 45 78 60 2 412
85 OpenTopoMap 0 0 0 0 0 0 0 0 8 22 35 94 119 109 118 103 89 16 412
86 {switch:a,b,c,d}.tiles.mapbox.com 0 0 0 0 0 0 1 0 37 59 212 92 37 3 0 0 0 0 409
87 Ortho HR 0 0 0 0 0 0 0 0 0 1 54 107 216 195 27 8 5 1 405
88 MAPNIK 0 0 0 0 1 197 155 47 27 5 3 22 10 9 8 7 5 0 404
89 JOSM 0 1 2 1 3 51 47 60 60 59 60 66 56 49 42 21 23 4 401
90 - 0 0 0 0 8 49 35 53 49 63 54 44 46 49 50 50 46 12 397
91 Maps4BW 0 0 0 0 24 99 101 102 95 107 98 89 96 82 35 6 4 0 396
92 1.19) 0 0 0 0 0 0 0 0 0 1 3 124 322 16 5 6 1 1 394
93 1.79) 0 0 0 0 0 0 0 0 1 1 0 124 319 18 4 8 1 0 393
94 MML Orthophoto 0 0 0 0 0 0 0 0 0 17 27 57 64 126 154 128 82 23 388
95 imagery 0 1 1 2 3 17 15 27 32 51 66 69 54 42 40 55 74 14 388
96 Ortho HR IGN 0 0 0 0 0 0 0 0 0 0 0 0 7 26 226 164 136 20 383
97 Luftbild 0 0 1 1 5 22 30 25 42 49 63 62 56 54 54 63 59 17 382
98 geoportal 0 0 0 0 3 83 62 69 51 50 66 64 71 49 52 46 48 12 381
99 Vor Ort 0 0 0 1 1 24 22 14 28 31 45 42 54 65 100 90 81 13 380
100 visit 0 3 0 1 5 41 33 44 41 63 59 45 49 53 55 38 51 5 379