that's too much!

This commit is contained in:
2024-12-19 20:22:56 -08:00
parent 0020a609dd
commit 32cd60e92b
8443 changed files with 1446950 additions and 42 deletions

View File

@@ -0,0 +1,332 @@
import os
import contextlib
from zipfile import ZipFile, ZIP_DEFLATED
import pytest
import pyogrio
import pyogrio.raw
from pyogrio.util import vsi_path
# GeoPandas is optional: record whether it can be imported so that the
# dataframe-based tests can be skipped when it is missing.
try:
    import geopandas  # NOQA
except ImportError:
    has_geopandas = False
else:
    has_geopandas = True
@contextlib.contextmanager
def change_cwd(path):
    """Temporarily make ``path`` the process's current working directory.

    The directory that was current on entry is restored when the ``with``
    block exits, whether it completes normally or raises.
    """
    origin = os.getcwd()
    target = str(path)
    os.chdir(target)
    try:
        yield
    finally:
        # Always go back, even if the body of the ``with`` block failed.
        os.chdir(origin)
# (input path or URI, string expected back from ``vsi_path``) pairs.
_VSI_PATH_CASES = [
    # local file paths: returned unchanged, except that file:// is stripped
    ("data.gpkg", "data.gpkg"),
    ("/home/user/data.gpkg", "/home/user/data.gpkg"),
    (r"C:\User\Documents\data.gpkg", r"C:\User\Documents\data.gpkg"),
    ("file:///home/user/data.gpkg", "/home/user/data.gpkg"),
    # cloud URIs
    ("https://testing/data.gpkg", "/vsicurl/https://testing/data.gpkg"),
    ("s3://testing/data.gpkg", "/vsis3/testing/data.gpkg"),
    ("gs://testing/data.gpkg", "/vsigs/testing/data.gpkg"),
    ("az://testing/data.gpkg", "/vsiaz/testing/data.gpkg"),
    ("adl://testing/data.gpkg", "/vsiadls/testing/data.gpkg"),
    ("adls://testing/data.gpkg", "/vsiadls/testing/data.gpkg"),
    ("hdfs://testing/data.gpkg", "/vsihdfs/testing/data.gpkg"),
    ("webhdfs://testing/data.gpkg", "/vsiwebhdfs/testing/data.gpkg"),
    # archive schemes, optionally with a "!" member specifier
    ("zip://data.zip", "/vsizip/data.zip"),
    ("tar://data.tar", "/vsitar/data.tar"),
    ("gzip://data.gz", "/vsigzip/data.gz"),
    ("tar://./my.tar!my.geojson", "/vsitar/./my.tar/my.geojson"),
    (
        "zip://home/data/shapefile.zip!layer.shp",
        "/vsizip/home/data/shapefile.zip/layer.shp",
    ),
    # combined schemes
    ("zip+s3://testing/shapefile.zip", "/vsizip/vsis3/testing/shapefile.zip"),
    (
        "zip+https://s3.amazonaws.com/testing/shapefile.zip",
        "/vsizip/vsicurl/https://s3.amazonaws.com/testing/shapefile.zip",
    ),
    # plain .zip paths get the /vsizip/ prefix automatically
    ("test.zip", "/vsizip/test.zip"),
    ("/a/b/test.zip", "/vsizip//a/b/test.zip"),
    ("a/b/test.zip", "/vsizip/a/b/test.zip"),
    # archives using the "!" notation should be prefixed by vsizip
    ("test.zip!item.shp", "/vsizip/test.zip/item.shp"),
    ("test.zip!/a/b/item.shp", "/vsizip/test.zip/a/b/item.shp"),
    ("test.zip!a/b/item.shp", "/vsizip/test.zip/a/b/item.shp"),
    ("/vsizip/test.zip/a/b/item.shp", "/vsizip/test.zip/a/b/item.shp"),
    ("zip:///test.zip/a/b/item.shp", "/vsizip//test.zip/a/b/item.shp"),
    # remote .zip files get the /vsizip/ prefix automatically as well
    (
        "https://s3.amazonaws.com/testing/test.zip",
        "/vsizip/vsicurl/https://s3.amazonaws.com/testing/test.zip",
    ),
    (
        "https://s3.amazonaws.com/testing/test.zip!/a/b/item.shp",
        "/vsizip/vsicurl/https://s3.amazonaws.com/testing/test.zip/a/b/item.shp",
    ),
    ("s3://testing/test.zip", "/vsizip/vsis3/testing/test.zip"),
    (
        "s3://testing/test.zip!a/b/item.shp",
        "/vsizip/vsis3/testing/test.zip/a/b/item.shp",
    ),
]


@pytest.mark.parametrize("path, expected", _VSI_PATH_CASES)
def test_vsi_path(path, expected):
    """Check that ``vsi_path`` maps each input to its expected string."""
    converted = vsi_path(path)
    assert converted == expected
def test_vsi_path_unknown():
    """A URI with an unrecognized scheme is expected back unchanged."""
    unknown_uri = "s4://test/data.geojson"
    assert vsi_path(unknown_uri) == unknown_uri
def test_vsi_handling_read_functions(naturalearth_lowres_vsi):
    """Each non-dataframe read entry point must accept a ``zip://`` path."""
    # A zip:// path would fail in any entry point lacking the path handling.
    archive, _ = naturalearth_lowres_vsi
    zip_uri = f"zip://{archive}"

    raw = pyogrio.raw.read(zip_uri)
    assert len(raw[2]) == 177

    info = pyogrio.read_info(zip_uri)
    assert info["features"] == 177

    bounds = pyogrio.read_bounds(zip_uri)
    assert len(bounds[0]) == 177
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_vsi_handling_read_dataframe(naturalearth_lowres_vsi):
    """``read_dataframe`` must accept a ``zip://`` path."""
    archive, _ = naturalearth_lowres_vsi
    frame = pyogrio.read_dataframe(f"zip://{archive}")
    assert len(frame) == 177
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_path_absolute(data_dir):
    """Read the shapefile via a path object and via its string form."""
    shapefile = data_dir / "naturalearth_lowres/naturalearth_lowres.shp"
    # First the path object exactly as built above, then the plain str.
    for candidate in (shapefile, str(shapefile)):
        frame = pyogrio.read_dataframe(candidate)
        assert len(frame) == 177
def test_path_relative(data_dir):
    """Read through a path that is relative to the working directory."""
    relative = "naturalearth_lowres/naturalearth_lowres.shp"
    # The relative path only resolves while ``data_dir`` is the cwd.
    with change_cwd(data_dir):
        raw = pyogrio.raw.read(relative)
        assert len(raw[2]) == 177

        info = pyogrio.read_info(relative)
        assert info["features"] == 177

        bounds = pyogrio.read_bounds(relative)
        assert len(bounds[0]) == 177
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_path_relative_dataframe(data_dir):
    """``read_dataframe`` must resolve a path relative to the cwd."""
    relative = "naturalearth_lowres/naturalearth_lowres.shp"
    with change_cwd(data_dir):
        frame = pyogrio.read_dataframe(relative)
        assert len(frame) == 177
def test_uri_local_file(data_dir):
    """Read a local shapefile addressed through a ``file://`` URI."""
    shapefile = data_dir / "naturalearth_lowres/naturalearth_lowres.shp"
    uri = f"file://{shapefile}"

    raw = pyogrio.raw.read(uri)
    assert len(raw[2]) == 177

    info = pyogrio.read_info(uri)
    assert info["features"] == 177

    bounds = pyogrio.read_bounds(uri)
    assert len(bounds[0]) == 177
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_uri_local_file_dataframe(data_dir):
    """``read_dataframe`` must accept a ``file://`` URI."""
    shapefile = data_dir / "naturalearth_lowres/naturalearth_lowres.shp"
    frame = pyogrio.read_dataframe(f"file://{shapefile}")
    assert len(frame) == 177
def test_zip_path(naturalearth_lowres_vsi):
    """Read a zipped dataset via absolute, /vsizip/ and relative paths."""
    archive, archive_vsi = naturalearth_lowres_vsi

    def check_reads(target):
        # Run the three non-dataframe readers, in order, against ``target``.
        raw = pyogrio.raw.read(target)
        assert len(raw[2]) == 177
        info = pyogrio.read_info(target)
        assert info["features"] == 177
        bounds = pyogrio.read_bounds(target)
        assert len(bounds[0]) == 177

    # absolute zip path
    check_reads("zip://" + str(archive))

    # absolute vsizip path
    check_reads(archive_vsi)

    # relative zip path, resolved from the archive's own directory
    relative_uri = "zip://" + archive.name
    with change_cwd(archive.parent):
        check_reads(relative_uri)
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_zip_path_dataframe(naturalearth_lowres_vsi):
    """``read_dataframe`` counterpart of the zip path checks."""
    archive, archive_vsi = naturalearth_lowres_vsi

    # absolute zip path
    frame = pyogrio.read_dataframe(f"zip://{archive}")
    assert len(frame) == 177

    # absolute vsizip path
    frame = pyogrio.read_dataframe(archive_vsi)
    assert len(frame) == 177

    # relative zip path, resolved from the archive's own directory
    with change_cwd(archive.parent):
        frame = pyogrio.read_dataframe(f"zip://{archive.name}")
        assert len(frame) == 177
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_detect_zip_path(tmp_path, naturalearth_lowres):
    """Read individual shapefiles out of a zip holding two of them."""
    # Build the two source shapefiles: one with Canada, one with Peru.
    countries = pyogrio.read_dataframe(
        naturalearth_lowres, where="iso_a3 in ('CAN', 'PER')"
    )
    pyogrio.write_dataframe(
        countries.loc[countries.iso_a3 == "CAN"], tmp_path / "test1.shp"
    )
    pyogrio.write_dataframe(
        countries.loc[countries.iso_a3 == "PER"], tmp_path / "test2.shp"
    )

    # Zip them: test1.* at the archive root, test2.* under /a/b/.
    path = tmp_path / "test.zip"
    with ZipFile(path, mode="w", compression=ZIP_DEFLATED, compresslevel=5) as archive:
        for suffix in ("dbf", "prj", "shp", "shx"):
            root_member = f"test1.{suffix}"
            archive.write(tmp_path / root_member, root_member)
            nested_member = f"test2.{suffix}"
            archive.write(tmp_path / nested_member, f"/a/b/{nested_member}")

    checks = [
        # defaults to the first shapefile found, at lowest subdirectory
        (path, "CAN"),
        # selecting a shapefile from within the zip requires the "!"
        # archive specifier
        (f"{path}!test1.shp", "CAN"),
        (f"{path}!/a/b/test2.shp", "PER"),
        # specifying the zip:// scheme should also work
        (f"zip://{path}!/a/b/test2.shp", "PER"),
        # specifying /vsizip/ should also work, but the path must already
        # be in GDAL-ready format, without the "!" archive specifier
        (f"/vsizip/{path}/a/b/test2.shp", "PER"),
    ]
    for target, expected_iso in checks:
        frame = pyogrio.read_dataframe(target)
        assert frame.iso_a3[0] == expected_iso
@pytest.mark.network
def test_url():
    """Read a shapefile directly from an https URL (needs network access)."""
    remote = "https://raw.githubusercontent.com/geopandas/pyogrio/main/pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.shp"  # NOQA

    raw = pyogrio.raw.read(remote)
    assert len(raw[2]) == 177

    info = pyogrio.read_info(remote)
    assert info["features"] == 177

    bounds = pyogrio.read_bounds(remote)
    assert len(bounds[0]) == 177
@pytest.mark.network
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_url_dataframe():
    """``read_dataframe`` must be able to read from an https URL.

    The test downloads data, so it carries the ``network`` marker and can be
    selected or deselected together with the other network-dependent tests.
    """
    url = "https://raw.githubusercontent.com/geopandas/pyogrio/main/pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.shp"  # NOQA
    assert len(pyogrio.read_dataframe(url)) == 177
@pytest.mark.network
def test_url_with_zip():
    """Read a remote zip archive through the ``zip+https`` scheme."""
    remote = "zip+https://s3.amazonaws.com/fiona-testing/coutwildrnp.zip"

    raw = pyogrio.raw.read(remote)
    assert len(raw[2]) == 67

    info = pyogrio.read_info(remote)
    assert info["features"] == 67

    bounds = pyogrio.read_bounds(remote)
    assert len(bounds[0]) == 67
@pytest.mark.network
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_url_with_zip_dataframe():
    """``read_dataframe`` must accept a ``zip+https`` URL."""
    frame = pyogrio.read_dataframe(
        "zip+https://s3.amazonaws.com/fiona-testing/coutwildrnp.zip"
    )
    assert len(frame) == 67
@pytest.fixture
def aws_env_setup(monkeypatch):
    """Set ``AWS_NO_SIGN_REQUEST=YES`` in the environment via ``monkeypatch``.

    NOTE(review): presumably this lets the S3 tests read the public test
    bucket without AWS credentials -- confirm against the GDAL docs.
    """
    env_name, env_value = "AWS_NO_SIGN_REQUEST", "YES"
    monkeypatch.setenv(env_name, env_value)
@pytest.mark.network
def test_uri_s3(aws_env_setup):
    """Read a zip archive from S3 through the ``zip+s3`` scheme."""
    remote = "zip+s3://fiona-testing/coutwildrnp.zip"

    raw = pyogrio.raw.read(remote)
    assert len(raw[2]) == 67

    info = pyogrio.read_info(remote)
    assert info["features"] == 67

    bounds = pyogrio.read_bounds(remote)
    assert len(bounds[0]) == 67
@pytest.mark.network
@pytest.mark.skipif(not has_geopandas, reason="GeoPandas not available")
def test_uri_s3_dataframe(aws_env_setup):
    """``read_dataframe`` must accept a ``zip+s3`` URI."""
    remote = "zip+s3://fiona-testing/coutwildrnp.zip"
    frame = pyogrio.read_dataframe(remote)
    assert len(frame) == 67