"""
Script that generates the included datasets 'naturalearth_lowres.shp'
and 'naturalearth_cities.shp'.

Raw data:
https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/110m/cultural/ne_110m_admin_0_countries.zip

Current version used: see code
"""  # noqa: E501

import tempfile
from pathlib import Path
from zipfile import ZipFile

import geopandas as gpd
import requests
from shapely.geometry import box

# Natural Earth release to request. "latest" sends no ``version`` query
# parameter; any other value is passed to the server as ``?version=<value>``.
version = "latest"
urlbase = "https://www.naturalearthdata.com/"
urlbase += "http//www.naturalearthdata.com/download/110m/cultural/"

# Seconds to wait for the server (to connect, and between received chunks)
# before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 60


def countries_override(world_raw):
    """Patch the raw countries table so it matches the distributed layout.

    Parameters
    ----------
    world_raw : GeoDataFrame
        Raw ``ne_110m_admin_0_countries`` table. Modified in place: rows whose
        ``ISO_A3`` is ``"-99"`` and whose ``TYPE`` is ``"Sovereign country"``
        or ``"Country"`` get their ``ISO_A3`` replaced by ``ADM0_A3``.

    Returns
    -------
    GeoDataFrame
        The table with column ``GDP_MD`` renamed to ``GDP_MD_EST``.
    """
    # not ideal - fix some country codes
    needs_code = world_raw["ISO_A3"].eq("-99") & world_raw["TYPE"].isin(
        ["Sovereign country", "Country"]
    )
    world_raw.loc[needs_code, "ISO_A3"] = world_raw.loc[needs_code, "ADM0_A3"]
    # backwards compatibility: keep the column name used by the shipped dataset
    return world_raw.rename(columns={"GDP_MD": "GDP_MD_EST"})


def df_same(new, old, dataset, log):
    """Report whether ``new`` and ``old`` hold the same data.

    Any difference found is appended to ``log`` as a markdown section headed
    with ``dataset``.

    Parameters
    ----------
    new, old : DataFrame
        Freshly downloaded table and the currently distributed one.
    dataset : str
        Label used in the log heading.
    log : list of str
        Mutated in place; receives at most one entry per call.

    Returns
    -------
    bool
        ``True`` when the shapes match and ``new.compare(old)`` is empty,
        ``False`` otherwise.

    Raises
    ------
    AssertionError
        If the two tables do not have the same columns.
    """
    assert (new.columns == old.columns).all(), "columns should be the same"

    if new.shape != old.shape:
        # Outer-join on "name" and keep rows with a missing value on either
        # side, i.e. the rows that were added or removed.
        dfc = old.merge(new, on="name", how="outer", suffixes=("_old", "_new")).loc[
            lambda d: d.isna().any(axis=1)
        ]
        log.append(f"### {dataset} row count changed ###\n{dfc.to_markdown()}")
        return False

    dfc = new.compare(old)
    if len(dfc) == 0:
        return True
    log.append(f"### {dataset} data changed ###\n{dfc.to_markdown()}")
    return False


def _download(url, dest):
    """Stream ``url`` to the file ``dest``; raise if the status is not 200."""
    with requests.get(
        url,
        stream=True,
        # NOTE(review): presumably the server rejects requests' default
        # User-Agent - confirm before changing this header.
        headers={"User-Agent": "XY"},
        params=None if version == "latest" else {"version": version},
        timeout=_REQUEST_TIMEOUT,
    ) as r:
        # Explicit raise rather than ``assert`` so the check survives
        # ``python -O`` and an error page is never saved as the zip.
        if r.status_code != 200:
            raise RuntimeError(
                f"version: {version} does not exist. status: {r.status_code}"
            )
        with open(dest, "wb") as fd:
            for chunk in r.iter_content(chunk_size=8192):
                fd.write(chunk)


def _read_version(zip_path):
    """Return the stripped text of the single VERSION member in ``zip_path``."""
    # The context manager closes the archive, so the temporary directory that
    # holds it can be removed afterwards.
    with ZipFile(zip_path) as z:
        version_f = [i for i in z.infolist() if "VERSION" in i.filename]
        if len(version_f) != 1:
            raise RuntimeError("failed to find VERSION file")
        return z.read(version_f[0]).decode("utf-8").strip()


def _override_crimea(gdf):
    """Move the part of 'Russia' inside the Crimean bounding box to 'Ukraine'.

    See issue #2382. ``gdf`` is modified in place and also returned.
    """
    crimean_bbox = box(32.274, 44.139, 36.65, 46.704)

    is_russia = gdf["name"] == "Russia"
    is_ukraine = gdf["name"] == "Ukraine"
    r_ix = gdf.index[is_russia][0]
    u_ix = gdf.index[is_ukraine][0]

    # Read both original geometries before writing either one back.
    russia = gdf.loc[is_russia, "geometry"].iloc[0]
    ukraine = gdf.loc[is_ukraine, "geometry"].iloc[0]

    crimea_only = russia.intersection(crimean_bbox)
    complete_ukraine = ukraine.union(crimea_only)
    correct_russia = russia.difference(crimean_bbox)

    gdf.at[r_ix, "geometry"] = correct_russia
    gdf.at[u_ix, "geometry"] = complete_ukraine
    return gdf


# One entry per distributed dataset:
#   file     - archive name appended to ``urlbase``
#   cols     - columns kept from the raw table (lower-cased afterwards)
#   override - optional callable applied to the raw table before ``cols``
#   current  - path of the dataset currently shipped with geopandas
# NOTE(review): ``gpd.datasets`` is only available in older geopandas
# releases - confirm the version this script is meant to run against.
config = [
    {
        "file": "ne_110m_populated_places.zip",
        "cols": ["NAME", "geometry"],
        "current": gpd.datasets.get_path("naturalearth_cities"),
    },
    {
        "file": "ne_110m_admin_0_countries.zip",
        "cols": ["POP_EST", "CONTINENT", "NAME", "ISO_A3", "GDP_MD_EST", "geometry"],
        "override": countries_override,
        "current": gpd.datasets.get_path("naturalearth_lowres"),
    },
]

downloads = {}  # archive name -> GeoDataFrame that differs from the shipped one
log = []  # markdown sections describing every detected change
for dl in config:
    with tempfile.TemporaryDirectory() as tmpdirname:
        zip_path = Path(tmpdirname).joinpath(dl["file"])
        _download(urlbase + dl["file"], zip_path)

        # extract the natural earth version
        dl_version = _read_version(zip_path)

        # extract geodataframe from zip
        gdf = gpd.read_file(zip_path)
        # maintain structure that geopandas distributes
        if "override" in dl:
            gdf = dl["override"](gdf)
        gdf = gdf.loc[:, dl["cols"]]
        gdf = gdf.rename(columns=str.lower)

        # override Crimea #2382
        if dl["file"] == "ne_110m_admin_0_countries.zip":
            gdf = _override_crimea(gdf)

        # get changes between current version and new version
        if not df_same(gdf, gpd.read_file(dl["current"]), dl["file"], log):
            downloads[dl["file"]] = gdf

# create change log that can be pasted into PR
# (``dl_version`` is the version read from the last archive processed above;
# UTF-8 is explicit because the tables may contain non-ASCII place names)
with open(f"CHANGE_{dl_version}.md", "w", encoding="utf-8") as change_log:
    change_log.write("\n\n".join(log))

# save downloaded geodataframe to appropriate place:
# "<name of the current dataset's directory>/<current file name>", relative
# to the working directory
for k, gdf_ in downloads.items():
    current = next(Path(c["current"]) for c in config if c["file"] == k)
    gdf_.to_file(
        driver="ESRI Shapefile",
        filename=Path(current.parent.name).joinpath(current.name),
    )