del env py
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,614 +0,0 @@
|
||||
import json
|
||||
from packaging.version import Version
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pyarrow as pa
|
||||
from numpy.typing import NDArray
|
||||
|
||||
import shapely
|
||||
from shapely import GeometryType
|
||||
|
||||
from geopandas import GeoDataFrame
|
||||
from geopandas._compat import SHAPELY_GE_204
|
||||
from geopandas.array import from_shapely, from_wkb
|
||||
|
||||
# Native GeoArrow geometry encodings (the suffixes of "geoarrow.<encoding>"
# extension names); "geoarrow.wkb" is handled separately.
GEOARROW_ENCODINGS = [
    "point",
    "linestring",
    "polygon",
    "multipoint",
    "multilinestring",
    "multipolygon",
]
|
||||
|
||||
|
||||
## GeoPandas -> GeoArrow
|
||||
|
||||
|
||||
class ArrowTable:
    """
    Thin wrapper exposing Arrow table data through the
    `Arrow PyCapsule Protocol`_ (an ``__arrow_c_stream__`` method), so any
    Arrow implementation supporting that protocol can consume the result.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

    Example
    -------
    >>> import pyarrow as pa
    >>> pa.table(gdf.to_arrow())  # doctest: +SKIP

    """

    def __init__(self, pa_table):
        self._pa_table = pa_table

    def __arrow_c_stream__(self, requested_schema=None):
        # delegate directly to the wrapped table's capsule export
        wrapped = self._pa_table
        return wrapped.__arrow_c_stream__(requested_schema=requested_schema)
|
||||
|
||||
|
||||
class GeoArrowArray:
    """
    Thin wrapper exposing a geometry array as Arrow data through the
    `Arrow PyCapsule Protocol`_ (an ``__arrow_c_array__`` method), so any
    Arrow implementation supporting that protocol can consume the result.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

    Example
    -------
    >>> import pyarrow as pa
    >>> pa.array(ser.to_arrow())  # doctest: +SKIP

    """

    def __init__(self, pa_field, pa_array):
        self._pa_array = pa_array
        self._pa_field = pa_field

    def __arrow_c_array__(self, requested_schema=None):
        # schema negotiation is not implemented for geometry arrays
        if requested_schema is not None:
            raise NotImplementedError(
                "Requested schema is not supported for geometry arrays"
            )
        # export the schema from the *field* (which carries the GeoArrow
        # extension metadata) and the data capsule from the array
        schema_capsule = self._pa_field.__arrow_c_schema__()
        array_capsule = self._pa_array.__arrow_c_array__()[1]
        return schema_capsule, array_capsule
|
||||
|
||||
|
||||
def geopandas_to_arrow(
    df,
    index=None,
    geometry_encoding="WKB",
    interleaved=True,
    include_z=None,
):
    """
    Convert GeoDataFrame to a pyarrow.Table.

    Parameters
    ----------
    df : GeoDataFrame
        The GeoDataFrame to convert.
    index : bool, default None
        If ``True``, always include the dataframe's index(es) as columns
        in the file output.
        If ``False``, the index(es) will not be written to the file.
        If ``None``, the index(es) will be included as columns in the file
        output except `RangeIndex` which is stored as metadata only.
    geometry_encoding : {'WKB', 'geoarrow' }, default 'WKB'
        The GeoArrow encoding to use for the data conversion.
    interleaved : bool, default True
        Only relevant for 'geoarrow' encoding. If True, the geometries'
        coordinates are interleaved in a single fixed size list array.
        If False, the coordinates are stored as separate arrays in a
        struct type.
    include_z : bool, default None
        Only relevant for 'geoarrow' encoding (for WKB, the dimensionality
        of the individial geometries is preserved).
        If False, return 2D geometries. If True, include the third dimension
        in the output (if a geometry has no third dimension, the z-coordinates
        will be NaN). By default, will infer the dimensionality from the
        input geometries. Note that this inference can be unreliable with
        empty geometries (for a guaranteed result, it is recommended to
        specify the keyword).

    Returns
    -------
    (pyarrow.Table, dict)
        The converted table, plus a mapping of geometry column name to the
        encoding used for it ("WKB" or a bare geoarrow encoding name).
    """
    mask = df.dtypes == "geometry"
    geometry_columns = df.columns[mask]
    geometry_indices = np.asarray(mask).nonzero()[0]

    df_attr = pd.DataFrame(df.copy(deep=False))

    # replace geometry columns with dummy values -> will get converted to
    # Arrow null column (not holding any memory), so we can afterwards
    # fill the resulting table with the correct geometry fields
    for col in geometry_columns:
        df_attr[col] = None

    table = pa.Table.from_pandas(df_attr, preserve_index=index)

    geometry_encoding_dict = {}

    if geometry_encoding.lower() == "geoarrow":
        if Version(pa.__version__) < Version("10.0.0"):
            raise ValueError("Converting to 'geoarrow' requires pyarrow >= 10.0.")

        # Encode all geometry columns to GeoArrow
        for i, col in zip(geometry_indices, geometry_columns):
            field, geom_arr = construct_geometry_array(
                np.array(df[col].array),
                include_z=include_z,
                field_name=col,
                crs=df[col].crs,
                interleaved=interleaved,
            )
            table = table.set_column(i, field, geom_arr)
            # record e.g. "point" (strip the "geoarrow." prefix)
            geometry_encoding_dict[col] = (
                field.metadata[b"ARROW:extension:name"]
                .decode()
                .removeprefix("geoarrow.")
            )

    elif geometry_encoding.lower() == "wkb":
        # Encode all geometry columns to WKB
        for i, col in zip(geometry_indices, geometry_columns):
            field, wkb_arr = construct_wkb_array(
                np.asarray(df[col].array), field_name=col, crs=df[col].crs
            )
            table = table.set_column(i, field, wkb_arr)
            geometry_encoding_dict[col] = "WKB"

    else:
        raise ValueError(
            f"Expected geometry encoding 'WKB' or 'geoarrow' got {geometry_encoding}"
        )
    return table, geometry_encoding_dict
|
||||
|
||||
|
||||
def construct_wkb_array(
    shapely_arr: NDArray[np.object_],
    *,
    field_name: str = "geometry",
    crs: Optional[str] = None,
) -> Tuple[pa.Field, pa.Array]:
    """Encode an array of shapely geometries as WKB in a pyarrow binary array.

    Returns a (field, array) pair; the field carries the "geoarrow.wkb"
    extension name and (optional) CRS in its metadata, without registering
    an extension type globally.
    """
    # NOTE(review): the condition excludes GEOS 3.10.0 itself while the error
    # message says "GEOS<3.10" — presumably ISO WKB needs GEOS >= 3.10;
    # confirm whether this boundary should be `>=`.
    if shapely.geos_version > (3, 10, 0):
        kwargs = {"flavor": "iso"}
    else:
        # pre-ISO WKB cannot represent Z coordinates faithfully
        if shapely.has_z(shapely_arr).any():
            raise ValueError("Cannot write 3D geometries with GEOS<3.10")
        kwargs = {}

    wkb_arr = shapely.to_wkb(shapely_arr, **kwargs)
    extension_metadata = {"ARROW:extension:name": "geoarrow.wkb"}
    if crs is not None:
        # crs is expected to provide a PROJJSON string via to_json()
        extension_metadata["ARROW:extension:metadata"] = json.dumps(
            {"crs": crs.to_json()}
        )
    else:
        # In theory this should not be needed, but otherwise pyarrow < 17
        # crashes on receiving such data through C Data Interface
        # https://github.com/apache/arrow/issues/41741
        extension_metadata["ARROW:extension:metadata"] = "{}"

    field = pa.field(
        field_name, type=pa.binary(), nullable=True, metadata=extension_metadata
    )
    parr = pa.array(np.asarray(wkb_arr), pa.binary())
    return field, parr
|
||||
|
||||
|
||||
def _convert_inner_coords(coords, interleaved, dims, mask=None):
    """Convert an (n, ndim) coordinate ndarray to a pyarrow array.

    Returns a FixedSizeListArray (``interleaved=True``) or a StructArray with
    "x"/"y"(/"z") float64 fields (``interleaved=False``). ``dims`` is "xy" or
    "xyz". ``mask`` (a pyarrow boolean array or None) marks missing
    geometries; it is only passed for point arrays.
    """
    if interleaved:
        # fixed-size list of len(dims) float64 values per point
        coords_field = pa.field(dims, pa.float64(), nullable=False)
        typ = pa.list_(coords_field, len(dims))
        if mask is None:
            # mask keyword only added in pyarrow 15.0.0
            parr = pa.FixedSizeListArray.from_arrays(coords.ravel(), type=typ)
        else:
            parr = pa.FixedSizeListArray.from_arrays(
                coords.ravel(), type=typ, mask=mask
            )
    else:
        if dims == "xy":
            fields = [
                pa.field("x", pa.float64(), nullable=False),
                pa.field("y", pa.float64(), nullable=False),
            ]
            # .copy() — presumably to hand pyarrow contiguous 1-D buffers
            # (column slices of a 2-D array are strided); confirm.
            parr = pa.StructArray.from_arrays(
                [coords[:, 0].copy(), coords[:, 1].copy()], fields=fields, mask=mask
            )
        else:
            fields = [
                pa.field("x", pa.float64(), nullable=False),
                pa.field("y", pa.float64(), nullable=False),
                pa.field("z", pa.float64(), nullable=False),
            ]
            parr = pa.StructArray.from_arrays(
                [coords[:, 0].copy(), coords[:, 1].copy(), coords[:, 2].copy()],
                fields=fields,
                mask=mask,
            )
    return parr
|
||||
|
||||
|
||||
def _linestring_type(point_type):
    """GeoArrow linestring storage type: list<vertices: point>."""
    vertices = pa.field("vertices", point_type, nullable=False)
    return pa.list_(vertices)
|
||||
|
||||
|
||||
def _polygon_type(point_type):
    """GeoArrow polygon storage type: list<rings: list<vertices: point>>."""
    vertices = pa.field("vertices", point_type, nullable=False)
    rings = pa.field("rings", pa.list_(vertices), nullable=False)
    return pa.list_(rings)
|
||||
|
||||
|
||||
def _multipoint_type(point_type):
    """GeoArrow multipoint storage type: list<points: point>."""
    points = pa.field("points", point_type, nullable=False)
    return pa.list_(points)
|
||||
|
||||
|
||||
def _multilinestring_type(point_type):
    """GeoArrow multilinestring storage type: list<linestrings: linestring>."""
    lines = pa.field("linestrings", _linestring_type(point_type), nullable=False)
    return pa.list_(lines)
|
||||
|
||||
|
||||
def _multipolygon_type(point_type):
    """GeoArrow multipolygon storage type: list<polygons: polygon>."""
    polygons = pa.field("polygons", _polygon_type(point_type), nullable=False)
    return pa.list_(polygons)
|
||||
|
||||
|
||||
def construct_geometry_array(
    shapely_arr: NDArray[np.object_],
    include_z: Optional[bool] = None,
    *,
    field_name: str = "geometry",
    crs: Optional[str] = None,
    interleaved: bool = True,
) -> Tuple[pa.Field, pa.Array]:
    """Encode an array of (single-typed) shapely geometries as GeoArrow.

    Parameters
    ----------
    shapely_arr : np.ndarray of shapely geometries
    include_z : bool, optional
        Passed to shapely.to_ragged_array; inferred by default.
    field_name : str, default "geometry"
    crs : CRS-like, optional
        Object exposing ``to_json()``; stored in the extension metadata.
    interleaved : bool, default True
        Fixed-size-list coordinate layout if True, struct layout otherwise.

    Returns
    -------
    (pyarrow.Field, pyarrow.Array)
        The field carries the "geoarrow.*" extension name and CRS metadata.

    Raises
    ------
    ValueError
        For unsupported geometry types, unexpected coordinate dimensions, or
        missing interleaved points on pyarrow < 15.
    """
    # NOTE: this implementation returns a (field, array) pair so that it can set the
    # extension metadata on the field without instantiating extension types into the
    # global pyarrow registry
    geom_type, coords, offsets = shapely.to_ragged_array(
        shapely_arr, include_z=include_z
    )

    mask = shapely.is_missing(shapely_arr)
    if mask.any():
        if (
            geom_type == GeometryType.POINT
            and interleaved
            and Version(pa.__version__) < Version("15.0.0")
        ):
            raise ValueError(
                "Converting point geometries with missing values is not supported "
                "for interleaved coordinates with pyarrow < 15.0.0. Please "
                "upgrade to a newer version of pyarrow."
            )
        mask = pa.array(mask, type=pa.bool_())

        if geom_type == GeometryType.POINT and not SHAPELY_GE_204:
            # bug in shapely < 2.0.4, see https://github.com/shapely/shapely/pull/2034
            # this workaround only works if there are no empty points
            indices = np.nonzero(mask)[0]
            indices = indices - np.arange(len(indices))
            coords = np.insert(coords, indices, np.nan, axis=0)

    else:
        mask = None

    if coords.shape[-1] == 2:
        dims = "xy"
    elif coords.shape[-1] == 3:
        dims = "xyz"
    else:
        raise ValueError(f"Unexpected coords dimensions: {coords.shape}")

    extension_metadata: Dict[str, str] = {}
    if crs is not None:
        extension_metadata["ARROW:extension:metadata"] = json.dumps(
            {"crs": crs.to_json()}
        )
    else:
        # In theory this should not be needed, but otherwise pyarrow < 17
        # crashes on receiving such data through C Data Interface
        # https://github.com/apache/arrow/issues/41741
        extension_metadata["ARROW:extension:metadata"] = "{}"

    # Build the nested Arrow array for the geometry type; the null mask is
    # applied at the outermost nesting level only.
    if geom_type == GeometryType.POINT:
        parr = _convert_inner_coords(coords, interleaved, dims, mask=mask)
        extension_metadata["ARROW:extension:name"] = "geoarrow.point"

    elif geom_type == GeometryType.LINESTRING:
        assert len(offsets) == 1, "Expected one offsets array"
        (geom_offsets,) = offsets
        _parr = _convert_inner_coords(coords, interleaved, dims)
        parr = pa.ListArray.from_arrays(
            pa.array(geom_offsets), _parr, _linestring_type(_parr.type), mask=mask
        )
        extension_metadata["ARROW:extension:name"] = "geoarrow.linestring"

    elif geom_type == GeometryType.POLYGON:
        assert len(offsets) == 2, "Expected two offsets arrays"
        ring_offsets, geom_offsets = offsets
        _parr = _convert_inner_coords(coords, interleaved, dims)
        _parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
        parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
        parr = parr.cast(_polygon_type(_parr.type))
        extension_metadata["ARROW:extension:name"] = "geoarrow.polygon"

    elif geom_type == GeometryType.MULTIPOINT:
        assert len(offsets) == 1, "Expected one offsets array"
        (geom_offsets,) = offsets
        _parr = _convert_inner_coords(coords, interleaved, dims)
        parr = pa.ListArray.from_arrays(
            pa.array(geom_offsets), _parr, type=_multipoint_type(_parr.type), mask=mask
        )
        extension_metadata["ARROW:extension:name"] = "geoarrow.multipoint"

    elif geom_type == GeometryType.MULTILINESTRING:
        assert len(offsets) == 2, "Expected two offsets arrays"
        ring_offsets, geom_offsets = offsets
        _parr = _convert_inner_coords(coords, interleaved, dims)
        _parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
        parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
        parr = parr.cast(_multilinestring_type(_parr.type))
        extension_metadata["ARROW:extension:name"] = "geoarrow.multilinestring"

    elif geom_type == GeometryType.MULTIPOLYGON:
        assert len(offsets) == 3, "Expected three offsets arrays"
        ring_offsets, polygon_offsets, geom_offsets = offsets
        _parr = _convert_inner_coords(coords, interleaved, dims)
        _parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
        _parr2 = pa.ListArray.from_arrays(pa.array(polygon_offsets), _parr1)
        parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr2, mask=mask)
        parr = parr.cast(_multipolygon_type(_parr.type))
        extension_metadata["ARROW:extension:name"] = "geoarrow.multipolygon"

    else:
        raise ValueError(f"Unsupported type for geoarrow: {geom_type}")

    # The field construction was previously duplicated verbatim in every
    # branch above; it is identical for all geometry types, so build it once.
    field = pa.field(
        field_name,
        parr.type,
        nullable=True,
        metadata=extension_metadata,
    )
    return field, parr
|
||||
|
||||
|
||||
## GeoArrow -> GeoPandas
|
||||
|
||||
|
||||
def _get_arrow_geometry_field(field):
|
||||
if (meta := field.metadata) is not None:
|
||||
if (ext_name := meta.get(b"ARROW:extension:name", None)) is not None:
|
||||
if ext_name.startswith(b"geoarrow."):
|
||||
if (
|
||||
ext_meta := meta.get(b"ARROW:extension:metadata", None)
|
||||
) is not None:
|
||||
ext_meta = json.loads(ext_meta.decode())
|
||||
return ext_name.decode(), ext_meta
|
||||
|
||||
if isinstance(field.type, pa.ExtensionType):
|
||||
ext_name = field.type.extension_name
|
||||
if ext_name.startswith("geoarrow."):
|
||||
ext_meta_ser = field.type.__arrow_ext_serialize__()
|
||||
if ext_meta_ser:
|
||||
ext_meta = json.loads(ext_meta_ser.decode())
|
||||
else:
|
||||
ext_meta = None
|
||||
return ext_name, ext_meta
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def arrow_to_geopandas(table, geometry=None):
    """
    Convert Arrow table object to a GeoDataFrame based on GeoArrow extension types.

    Parameters
    ----------
    table : pyarrow.Table
        The Arrow table to convert.
    geometry : str, default None
        The name of the geometry column to set as the active geometry
        column. If None, the first geometry column found will be used.

    Returns
    -------
    GeoDataFrame

    Raises
    ------
    ValueError
        If no GeoArrow geometry column is found.
    TypeError
        If a "geoarrow." extension name is not a known encoding.
    """
    if not isinstance(table, pa.Table):
        # accept any object implementing the Arrow PyCapsule protocol
        table = pa.table(table)

    # collect (position, name, extension_name, extension_metadata) for every
    # column carrying a GeoArrow extension
    geom_fields = []

    for i, field in enumerate(table.schema):
        geom = _get_arrow_geometry_field(field)
        if geom is not None:
            geom_fields.append((i, field.name, *geom))

    if len(geom_fields) == 0:
        raise ValueError("No geometry column found in the Arrow table.")

    # convert the non-geometry columns via pyarrow, then re-insert the
    # geometry columns at their original positions (geom_fields is in
    # increasing column order, so each insert lands at the right index)
    table_attr = table.drop([f[1] for f in geom_fields])
    df = table_attr.to_pandas()

    for i, col, ext_name, ext_meta in geom_fields:
        crs = None
        if ext_meta is not None and "crs" in ext_meta:
            crs = ext_meta["crs"]

        if ext_name == "geoarrow.wkb":
            geom_arr = from_wkb(np.array(table[col]), crs=crs)
        elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
            # native encoding: rebuild shapely geometries from ragged arrays
            geom_arr = from_shapely(
                construct_shapely_array(table[col].combine_chunks(), ext_name), crs=crs
            )
        else:
            raise TypeError(f"Unknown GeoArrow extension type: {ext_name}")

        df.insert(i, col, geom_arr)

    # default active geometry column: the first geometry column found
    return GeoDataFrame(df, geometry=geometry or geom_fields[0][1])
|
||||
|
||||
|
||||
def arrow_to_geometry_array(arr):
    """
    Convert Arrow array object (representing single GeoArrow array) to a
    geopandas GeometryArray.

    Specifically for GeoSeries.from_arrow.

    Raises ValueError when pyarrow is too old, when no GeoArrow extension
    metadata is present, or when the extension name is unknown.
    """
    if Version(pa.__version__) < Version("14.0.0"):
        # the capsule import helpers used below require pyarrow >= 14
        raise ValueError("Importing from Arrow requires pyarrow >= 14.0.")

    # roundtrip through the Arrow PyCapsule protocol: the field carries the
    # extension metadata, the array carries the data
    schema_capsule, array_capsule = arr.__arrow_c_array__()
    field = pa.Field._import_from_c_capsule(schema_capsule)
    pa_arr = pa.Array._import_from_c_capsule(field.__arrow_c_schema__(), array_capsule)

    geom_info = _get_arrow_geometry_field(field)
    if geom_info is None:
        raise ValueError("No GeoArrow geometry field found.")
    ext_name, ext_meta = geom_info

    crs = None
    if ext_meta is not None and "crs" in ext_meta:
        crs = ext_meta["crs"]

    if ext_name == "geoarrow.wkb":
        geom_arr = from_wkb(np.array(pa_arr), crs=crs)
    elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
        # native encoding: rebuild shapely geometries from ragged arrays
        geom_arr = from_shapely(construct_shapely_array(pa_arr, ext_name), crs=crs)
    else:
        raise ValueError(f"Unknown GeoArrow extension type: {ext_name}")

    return geom_arr
|
||||
|
||||
|
||||
def _get_inner_coords(arr):
    """Extract an (n, ndim) coordinate ndarray from a GeoArrow coords array.

    Handles both layouts: struct of "x"/"y"(/"z") child arrays, and the
    interleaved fixed-size-list layout.
    """
    if not pa.types.is_struct(arr.type):
        # interleaved fixed size list: flatten and reshape to (n, ndim)
        return np.asarray(arr.values).reshape(len(arr), -1)

    # struct layout: stack the per-axis child arrays column-wise
    axes = ("x", "y") if arr.type.num_fields == 2 else ("x", "y", "z")
    return np.column_stack([np.asarray(arr.field(name)) for name in axes])
|
||||
|
||||
|
||||
def construct_shapely_array(arr: pa.Array, extension_name: str):
    """
    Construct a NumPy array of shapely geometries from a pyarrow.Array
    with GeoArrow extension type.

    ``extension_name`` selects how many levels of list nesting to unwrap;
    offsets are passed to shapely.from_ragged_array innermost-first.
    """
    if isinstance(arr, pa.ExtensionArray):
        # work on the raw storage array; the extension name is given separately
        arr = arr.storage

    if extension_name == "geoarrow.point":
        # points have no offsets, only coordinates
        coords = _get_inner_coords(arr)
        result = shapely.from_ragged_array(GeometryType.POINT, coords, None)

    elif extension_name == "geoarrow.linestring":
        # one nesting level: geometry -> vertices
        coords = _get_inner_coords(arr.values)
        offsets1 = np.asarray(arr.offsets)
        offsets = (offsets1,)
        result = shapely.from_ragged_array(GeometryType.LINESTRING, coords, offsets)

    elif extension_name == "geoarrow.polygon":
        # two nesting levels: geometry -> rings -> vertices
        coords = _get_inner_coords(arr.values.values)
        offsets2 = np.asarray(arr.offsets)
        offsets1 = np.asarray(arr.values.offsets)
        offsets = (offsets1, offsets2)
        result = shapely.from_ragged_array(GeometryType.POLYGON, coords, offsets)

    elif extension_name == "geoarrow.multipoint":
        # one nesting level: geometry -> points
        coords = _get_inner_coords(arr.values)
        offsets1 = np.asarray(arr.offsets)
        offsets = (offsets1,)
        result = shapely.from_ragged_array(GeometryType.MULTIPOINT, coords, offsets)

    elif extension_name == "geoarrow.multilinestring":
        # two nesting levels: geometry -> linestrings -> vertices
        coords = _get_inner_coords(arr.values.values)
        offsets2 = np.asarray(arr.offsets)
        offsets1 = np.asarray(arr.values.offsets)
        offsets = (offsets1, offsets2)
        result = shapely.from_ragged_array(
            GeometryType.MULTILINESTRING, coords, offsets
        )

    elif extension_name == "geoarrow.multipolygon":
        # three nesting levels: geometry -> polygons -> rings -> vertices
        coords = _get_inner_coords(arr.values.values.values)
        offsets3 = np.asarray(arr.offsets)
        offsets2 = np.asarray(arr.values.offsets)
        offsets1 = np.asarray(arr.values.values.offsets)
        offsets = (offsets1, offsets2, offsets3)
        result = shapely.from_ragged_array(GeometryType.MULTIPOLYGON, coords, offsets)

    else:
        raise ValueError(extension_name)

    # apply validity mask
    if arr.null_count:
        mask = np.asarray(arr.is_null())
        result = np.where(mask, None, result)

    return result
|
||||
@@ -1,72 +0,0 @@
|
||||
from packaging.version import Version
|
||||
|
||||
import pyarrow
|
||||
|
||||
_ERROR_MSG = """\
|
||||
Disallowed deserialization of 'arrow.py_extension_type':
|
||||
storage_type = {storage_type}
|
||||
serialized = {serialized}
|
||||
pickle disassembly:\n{pickle_disassembly}
|
||||
|
||||
Reading of untrusted Parquet or Feather files with a PyExtensionType column
|
||||
allows arbitrary code execution.
|
||||
If you trust this file, you can enable reading the extension type by one of:
|
||||
|
||||
- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
|
||||
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
|
||||
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
|
||||
|
||||
We strongly recommend updating your Parquet/Feather files to use extension types
|
||||
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
|
||||
See https://arrow.apache.org/docs/dev/python/extending_types.html#defining-extension-types-user-defined-types
|
||||
for more details.
|
||||
"""
|
||||
|
||||
|
||||
def patch_pyarrow():
    """Block unsafe deserialization of 'arrow.py_extension_type'.

    Replaces pyarrow's pickle-based PyExtensionType registration with a
    placeholder that raises a RuntimeError (showing the pickle disassembly)
    instead of unpickling — reading untrusted Parquet/Feather files with
    such a column otherwise allows arbitrary code execution. No-op when
    pyarrow >= 14.0.1 or the pyarrow-hotfix package already provides the
    protection, or when this patch was applied before.
    """
    # starting from pyarrow 14.0.1, it has its own mechanism
    if Version(pyarrow.__version__) >= Version("14.0.1"):
        return

    # if the user has pyarrow_hotfix (https://github.com/pitrou/pyarrow-hotfix)
    # installed, use this instead (which also ensures it works if they had
    # called `pyarrow_hotfix.uninstall()`)
    try:
        import pyarrow_hotfix  # noqa: F401
    except ImportError:
        pass
    else:
        return

    # if the hotfix is already installed and enabled
    if getattr(pyarrow, "_hotfix_installed", False):
        return

    class ForbiddenExtensionType(pyarrow.ExtensionType):
        # stateless placeholder: serializes to nothing ...
        def __arrow_ext_serialize__(self):
            return b""

        # ... and refuses to deserialize, surfacing the pickle payload
        # as a disassembly instead of executing it
        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            import io
            import pickletools

            out = io.StringIO()
            pickletools.dis(serialized, out)
            raise RuntimeError(
                _ERROR_MSG.format(
                    storage_type=storage_type,
                    serialized=serialized,
                    pickle_disassembly=out.getvalue(),
                )
            )

    # swap pyarrow's built-in registration for the refusing placeholder
    pyarrow.unregister_extension_type("arrow.py_extension_type")
    pyarrow.register_extension_type(
        ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
    )

    # flag so a second call (or another copy of this hotfix) is a no-op
    pyarrow._hotfix_installed = True


# apply the hotfix as soon as this module is imported
patch_pyarrow()
|
||||
@@ -1,913 +0,0 @@
|
||||
import json
|
||||
import warnings
|
||||
from packaging.version import Version
|
||||
|
||||
import numpy as np
|
||||
from pandas import DataFrame, Series
|
||||
|
||||
import shapely
|
||||
|
||||
import geopandas
|
||||
from geopandas import GeoDataFrame
|
||||
from geopandas._compat import import_optional_dependency
|
||||
from geopandas.array import from_shapely, from_wkb
|
||||
|
||||
from .file import _expand_user
|
||||
|
||||
METADATA_VERSION = "1.0.0"
|
||||
SUPPORTED_VERSIONS = ["0.1.0", "0.4.0", "1.0.0-beta.1", "1.0.0", "1.1.0"]
|
||||
GEOARROW_ENCODINGS = [
|
||||
"point",
|
||||
"linestring",
|
||||
"polygon",
|
||||
"multipoint",
|
||||
"multilinestring",
|
||||
"multipolygon",
|
||||
]
|
||||
SUPPORTED_ENCODINGS = ["WKB"] + GEOARROW_ENCODINGS
|
||||
|
||||
# reference: https://github.com/opengeospatial/geoparquet
|
||||
|
||||
# Metadata structure:
|
||||
# {
|
||||
# "geo": {
|
||||
# "columns": {
|
||||
# "<name>": {
|
||||
# "encoding": "WKB"
|
||||
# "geometry_types": <list of str: REQUIRED>
|
||||
# "crs": "<PROJJSON or None: OPTIONAL>",
|
||||
# "orientation": "<'counterclockwise' or None: OPTIONAL>"
|
||||
# "edges": "planar"
|
||||
# "bbox": <list of [xmin, ymin, xmax, ymax]: OPTIONAL>
|
||||
# "epoch": <float: OPTIONAL>
|
||||
# }
|
||||
# },
|
||||
# "primary_column": "<str: REQUIRED>",
|
||||
# "version": "<METADATA_VERSION>",
|
||||
#
|
||||
# # Additional GeoPandas specific metadata (not in metadata spec)
|
||||
# "creator": {
|
||||
# "library": "geopandas",
|
||||
# "version": "<geopandas.__version__>"
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
|
||||
|
||||
def _is_fsspec_url(url):
|
||||
return (
|
||||
isinstance(url, str)
|
||||
and "://" in url
|
||||
and not url.startswith(("http://", "https://"))
|
||||
)
|
||||
|
||||
|
||||
def _remove_id_from_member_of_ensembles(json_dict):
|
||||
"""
|
||||
Older PROJ versions will not recognize IDs of datum ensemble members that
|
||||
were added in more recent PROJ database versions.
|
||||
|
||||
Cf https://github.com/opengeospatial/geoparquet/discussions/110
|
||||
and https://github.com/OSGeo/PROJ/pull/3221
|
||||
|
||||
Mimicking the patch to GDAL from https://github.com/OSGeo/gdal/pull/5872
|
||||
"""
|
||||
for key, value in json_dict.items():
|
||||
if isinstance(value, dict):
|
||||
_remove_id_from_member_of_ensembles(value)
|
||||
elif key == "members" and isinstance(value, list):
|
||||
for member in value:
|
||||
member.pop("id", None)
|
||||
|
||||
|
||||
# type ids 0 to 7
# Maps shapely geometry type ids 0..7 to GeoParquet type names. Id 2
# (LinearRing) maps to "LineString" — NOTE(review): presumably intentional
# since GeoParquet defines no LinearRing type; confirm against the spec.
_geometry_type_names = [
    "Point",
    "LineString",
    "LineString",
    "Polygon",
    "MultiPoint",
    "MultiLineString",
    "MultiPolygon",
    "GeometryCollection",
]
# ids 8..15 are the Z (3D) variants of the same types
_geometry_type_names += [geom_type + " Z" for geom_type in _geometry_type_names]
|
||||
|
||||
|
||||
def _get_geometry_types(series):
    """
    Get unique geometry types from a GeoSeries.

    Returns a sorted list of GeoParquet type names (e.g. "Point",
    "MultiPolygon Z"), excluding missing geometries.
    """
    # shapely type ids index into _geometry_type_names; -1 marks missing
    arr_geometry_types = shapely.get_type_id(series.array._data)
    # ensure to include "... Z" for 3D geometries
    has_z = shapely.has_z(series.array._data)
    arr_geometry_types[has_z] += 8

    geometry_types = Series(arr_geometry_types).unique().tolist()
    # drop missing values (shapely.get_type_id returns -1 for those)
    if -1 in geometry_types:
        geometry_types.remove(-1)

    return sorted([_geometry_type_names[idx] for idx in geometry_types])
|
||||
|
||||
|
||||
def _create_metadata(
    df, schema_version=None, geometry_encoding=None, write_covering_bbox=False
):
    """Create and encode geo metadata dict.

    Parameters
    ----------
    df : GeoDataFrame
    schema_version : {'0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', None}
        GeoParquet specification version; if not provided will default to
        latest supported version.
    geometry_encoding : dict, optional
        Mapping of geometry column name -> encoding used for it ("WKB" or a
        native geoarrow encoding name).
    write_covering_bbox : bool, default False
        Writes the bounding box column for each row entry with column
        name 'bbox'. Writing a bbox column can be computationally
        expensive, hence is default setting is False.

    Returns
    -------
    dict
    """
    if schema_version is None:
        # native geoarrow encodings only exist in the 1.1.0 schema
        if geometry_encoding and any(
            encoding != "WKB" for encoding in geometry_encoding.values()
        ):
            schema_version = "1.1.0"
        else:
            schema_version = METADATA_VERSION

    if schema_version not in SUPPORTED_VERSIONS:
        raise ValueError(
            f"schema_version must be one of: {', '.join(SUPPORTED_VERSIONS)}"
        )

    # Construct metadata for each geometry
    column_metadata = {}
    for col in df.columns[df.dtypes == "geometry"]:
        series = df[col]

        geometry_types = _get_geometry_types(series)
        if schema_version[0] == "0":
            # 0.x schemas used the singular key, and a bare string when unique
            geometry_types_name = "geometry_type"
            if len(geometry_types) == 1:
                geometry_types = geometry_types[0]
        else:
            geometry_types_name = "geometry_types"

        crs = None
        if series.crs:
            if schema_version == "0.1.0":
                crs = series.crs.to_wkt()
            else:  # version >= 0.4.0
                crs = series.crs.to_json_dict()
                _remove_id_from_member_of_ensembles(crs)

        column_metadata[col] = {
            "encoding": geometry_encoding[col],
            "crs": crs,
            geometry_types_name: geometry_types,
        }

        bbox = series.total_bounds.tolist()
        if np.isfinite(bbox).all():
            # don't add bbox with NaNs for empty / all-NA geometry column
            column_metadata[col]["bbox"] = bbox

        if write_covering_bbox:
            # references the struct sub-fields of the per-row "bbox" column
            column_metadata[col]["covering"] = {
                "bbox": {
                    "xmin": ["bbox", "xmin"],
                    "ymin": ["bbox", "ymin"],
                    "xmax": ["bbox", "xmax"],
                    "ymax": ["bbox", "ymax"],
                },
            }

    return {
        "primary_column": df._geometry_column_name,
        "columns": column_metadata,
        "version": schema_version,
        "creator": {"library": "geopandas", "version": geopandas.__version__},
    }
|
||||
|
||||
|
||||
def _encode_metadata(metadata):
|
||||
"""Encode metadata dict to UTF-8 JSON string
|
||||
|
||||
Parameters
|
||||
----------
|
||||
metadata : dict
|
||||
|
||||
Returns
|
||||
-------
|
||||
UTF-8 encoded JSON string
|
||||
"""
|
||||
return json.dumps(metadata).encode("utf-8")
|
||||
|
||||
|
||||
def _decode_metadata(metadata_str):
|
||||
"""Decode a UTF-8 encoded JSON string to dict
|
||||
|
||||
Parameters
|
||||
----------
|
||||
metadata_str : string (UTF-8 encoded)
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
"""
|
||||
if metadata_str is None:
|
||||
return None
|
||||
|
||||
return json.loads(metadata_str.decode("utf-8"))
|
||||
|
||||
|
||||
def _validate_dataframe(df):
|
||||
"""Validate that the GeoDataFrame conforms to requirements for writing
|
||||
to Parquet format.
|
||||
|
||||
Raises `ValueError` if the GeoDataFrame is not valid.
|
||||
|
||||
copied from `pandas.io.parquet`
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : GeoDataFrame
|
||||
"""
|
||||
|
||||
if not isinstance(df, DataFrame):
|
||||
raise ValueError("Writing to Parquet/Feather only supports IO with DataFrames")
|
||||
|
||||
# must have value column names (strings only)
|
||||
if df.columns.inferred_type not in {"string", "unicode", "empty"}:
|
||||
raise ValueError("Writing to Parquet/Feather requires string column names")
|
||||
|
||||
# index level names must be strings
|
||||
valid_names = all(
|
||||
isinstance(name, str) for name in df.index.names if name is not None
|
||||
)
|
||||
if not valid_names:
|
||||
raise ValueError("Index level names must be strings")
|
||||
|
||||
|
||||
def _validate_geo_metadata(metadata):
|
||||
"""Validate geo metadata.
|
||||
Must not be empty, and must contain the structure specified above.
|
||||
|
||||
Raises ValueError if metadata is not valid.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
metadata : dict
|
||||
"""
|
||||
|
||||
if not metadata:
|
||||
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
|
||||
|
||||
# version was schema_version in 0.1.0
|
||||
version = metadata.get("version", metadata.get("schema_version"))
|
||||
if not version:
|
||||
raise ValueError(
|
||||
"'geo' metadata in Parquet/Feather file is missing required key: "
|
||||
"'version'"
|
||||
)
|
||||
|
||||
required_keys = ("primary_column", "columns")
|
||||
for key in required_keys:
|
||||
if metadata.get(key, None) is None:
|
||||
raise ValueError(
|
||||
"'geo' metadata in Parquet/Feather file is missing required key: "
|
||||
"'{key}'".format(key=key)
|
||||
)
|
||||
|
||||
if not isinstance(metadata["columns"], dict):
|
||||
raise ValueError("'columns' in 'geo' metadata must be a dict")
|
||||
|
||||
# Validate that geometry columns have required metadata and values
|
||||
# leaving out "geometry_type" for compatibility with 0.1
|
||||
required_col_keys = ("encoding",)
|
||||
for col, column_metadata in metadata["columns"].items():
|
||||
for key in required_col_keys:
|
||||
if key not in column_metadata:
|
||||
raise ValueError(
|
||||
"'geo' metadata in Parquet/Feather file is missing required key "
|
||||
"'{key}' for column '{col}'".format(key=key, col=col)
|
||||
)
|
||||
|
||||
if column_metadata["encoding"] not in SUPPORTED_ENCODINGS:
|
||||
raise ValueError(
|
||||
"Only WKB geometry encoding or one of the native encodings "
|
||||
f"({GEOARROW_ENCODINGS!r}) are supported, "
|
||||
f"got: {column_metadata['encoding']}"
|
||||
)
|
||||
|
||||
if column_metadata.get("edges", "planar") == "spherical":
|
||||
warnings.warn(
|
||||
f"The geo metadata indicate that column '{col}' has spherical edges, "
|
||||
"but because GeoPandas currently does not support spherical "
|
||||
"geometry, it ignores this metadata and will interpret the edges of "
|
||||
"the geometries as planar.",
|
||||
UserWarning,
|
||||
stacklevel=4,
|
||||
)
|
||||
|
||||
if "covering" in column_metadata:
|
||||
covering = column_metadata["covering"]
|
||||
if "bbox" in covering:
|
||||
bbox = covering["bbox"]
|
||||
for var in ["xmin", "ymin", "xmax", "ymax"]:
|
||||
if var not in bbox.keys():
|
||||
raise ValueError("Metadata for bbox column is malformed.")
|
||||
|
||||
|
||||
def _geopandas_to_arrow(
    df,
    index=None,
    geometry_encoding="WKB",
    schema_version=None,
    write_covering_bbox=None,
):
    """
    Helper function with main, shared logic for to_parquet/to_feather.

    Converts ``df`` to a pyarrow Table (geometry serialized per
    ``geometry_encoding``), optionally appends a per-row "bbox" struct
    column, and attaches the GeoParquet ``b"geo"`` entry to the table's
    schema metadata.

    Parameters
    ----------
    df : GeoDataFrame
    index : bool, optional
        Whether to store the index; forwarded to ``geopandas_to_arrow``.
    geometry_encoding : str, default "WKB"
        "WKB" or a native geoarrow encoding (the latter requires
        schema_version "1.1.0").
    schema_version : str, optional
        GeoParquet specification version written into the metadata.
    write_covering_bbox : bool, optional
        When truthy, append a "bbox" struct column with per-row bounds.

    Returns
    -------
    pyarrow.Table
    """
    from pyarrow import StructArray

    from geopandas.io._geoarrow import geopandas_to_arrow

    _validate_dataframe(df)

    if schema_version is not None:
        # native geoarrow encodings are only defined by spec version 1.1.0
        if geometry_encoding != "WKB" and schema_version != "1.1.0":
            raise ValueError(
                "'geoarrow' encoding is only supported with schema version >= 1.1.0"
            )

    table, geometry_encoding_dict = geopandas_to_arrow(
        df, geometry_encoding=geometry_encoding, index=index, interleaved=False
    )
    geo_metadata = _create_metadata(
        df,
        schema_version=schema_version,
        geometry_encoding=geometry_encoding_dict,
        write_covering_bbox=write_covering_bbox,
    )

    if write_covering_bbox:
        # refuse to silently clobber an existing user column named "bbox"
        if "bbox" in df.columns:
            raise ValueError(
                "An existing column 'bbox' already exists in the dataframe. "
                "Please rename to write covering bbox."
            )
        bounds = df.bounds
        bbox_array = StructArray.from_arrays(
            [bounds["minx"], bounds["miny"], bounds["maxx"], bounds["maxy"]],
            names=["xmin", "ymin", "xmax", "ymax"],
        )
        table = table.append_column("bbox", bbox_array)

    # Store geopandas specific file-level metadata
    # This must be done AFTER creating the table or it is not persisted
    metadata = table.schema.metadata
    metadata.update({b"geo": _encode_metadata(geo_metadata)})

    return table.replace_schema_metadata(metadata)
|
||||
|
||||
|
||||
def _to_parquet(
    df,
    path,
    index=None,
    compression="snappy",
    geometry_encoding="WKB",
    schema_version=None,
    write_covering_bbox=False,
    **kwargs,
):
    """
    Write a GeoDataFrame to the Parquet format.

    Any geometry columns present are serialized to WKB format in the file.

    Requires 'pyarrow'.

    This is tracking version 1.0.0 of the GeoParquet specification at:
    https://github.com/opengeospatial/geoparquet. Writing older versions is
    supported using the `schema_version` keyword.

    .. versionadded:: 0.8

    Parameters
    ----------
    path : str, path object
    index : bool, default None
        If ``True``, always include the dataframe's index(es) as columns
        in the file output.
        If ``False``, the index(es) will not be written to the file.
        If ``None``, the index(es) will be included as columns in the file
        output except `RangeIndex` which is stored as metadata only.
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
        The encoding to use for the geometry columns. Defaults to "WKB"
        for maximum interoperability. Specify "geoarrow" to use one of the
        native GeoArrow-based single-geometry type encodings.
    schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
        GeoParquet specification version; if not provided will default to
        latest supported version.
    write_covering_bbox : bool, default False
        Writes the bounding box column for each row entry with column
        name 'bbox'. Writing a bbox column can be computationally
        expensive, hence its default setting is False.
    **kwargs
        Additional keyword arguments passed to pyarrow.parquet.write_table().
    """
    parquet = import_optional_dependency(
        "pyarrow.parquet", extra="pyarrow is required for Parquet support."
    )

    path = _expand_user(path)
    # shared conversion logic (validation, metadata, optional bbox column)
    table = _geopandas_to_arrow(
        df,
        index=index,
        geometry_encoding=geometry_encoding,
        schema_version=schema_version,
        write_covering_bbox=write_covering_bbox,
    )
    parquet.write_table(table, path, compression=compression, **kwargs)
|
||||
|
||||
|
||||
def _to_feather(df, path, index=None, compression=None, schema_version=None, **kwargs):
    """
    Write a GeoDataFrame to the Feather format.

    Any geometry columns present are serialized to WKB format in the file.

    Requires 'pyarrow' >= 0.17.

    This is tracking version 1.0.0 of the GeoParquet specification for
    the metadata at: https://github.com/opengeospatial/geoparquet. Writing
    older versions is supported using the `schema_version` keyword.

    .. versionadded:: 0.8

    Parameters
    ----------
    path : str, path object
    index : bool, default None
        If ``True``, always include the dataframe's index(es) as columns
        in the file output.
        If ``False``, the index(es) will not be written to the file.
        If ``None``, the index(es) will be included as columns in the file
        output except `RangeIndex` which is stored as metadata only.
    compression : {'zstd', 'lz4', 'uncompressed'}, optional
        Name of the compression to use. Use ``"uncompressed"`` for no
        compression. By default uses LZ4 if available, otherwise uncompressed.
    schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
        GeoParquet specification version for the metadata; if not provided
        will default to latest supported version.
    kwargs
        Additional keyword arguments passed to pyarrow.feather.write_feather().
    """
    feather = import_optional_dependency(
        "pyarrow.feather", extra="pyarrow is required for Feather support."
    )
    # TODO move this into `import_optional_dependency`
    import pyarrow

    if Version(pyarrow.__version__) < Version("0.17.0"):
        raise ImportError("pyarrow >= 0.17 required for Feather support")

    path = _expand_user(path)
    # note: unlike _to_parquet, the Feather path always uses WKB encoding
    # and does not support a covering bbox column
    table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
    feather.write_feather(table, path, compression=compression, **kwargs)
|
||||
|
||||
|
||||
def _arrow_to_geopandas(table, geo_metadata=None):
    """
    Helper function with main, shared logic for read_parquet/read_feather.

    Parameters
    ----------
    table : pyarrow.Table
    geo_metadata : dict, optional
        Already decoded and validated 'geo' metadata; when None it is read
        from ``table.schema.metadata``.

    Returns
    -------
    GeoDataFrame

    Raises
    ------
    ValueError
        When the table contains no geometry columns.
    """
    if geo_metadata is None:
        # Note: this path of not passing metadata is also used by dask-geopandas
        geo_metadata = _validate_and_decode_metadata(table.schema.metadata)

    # Find all geometry columns that were read from the file. May
    # be a subset if 'columns' parameter is used.
    geometry_columns = [
        col for col in geo_metadata["columns"] if col in table.column_names
    ]
    # zero-row slice -> cheap pandas conversion just to learn the final
    # column order, so geometry columns can be re-inserted in place below
    result_column_names = list(table.slice(0, 0).to_pandas().columns)
    geometry_columns.sort(key=result_column_names.index)

    if not len(geometry_columns):
        raise ValueError(
            """No geometry columns are included in the columns read from
            the Parquet/Feather file.  To read this file without geometry columns,
            use pandas.read_parquet/read_feather() instead."""
        )

    geometry = geo_metadata["primary_column"]

    # Missing geometry likely indicates a subset of columns was read;
    # promote the first available geometry to the primary geometry.
    if len(geometry_columns) and geometry not in geometry_columns:
        geometry = geometry_columns[0]

        # if there are multiple non-primary geometry columns, raise a warning
        if len(geometry_columns) > 1:
            warnings.warn(
                "Multiple non-primary geometry columns read from Parquet/Feather "
                "file. The first column read was promoted to the primary geometry.",
                stacklevel=3,
            )

    # convert all non-geometry columns first
    table_attr = table.drop(geometry_columns)
    df = table_attr.to_pandas()

    # Convert the WKB columns that are present back to geometry.
    for col in geometry_columns:
        col_metadata = geo_metadata["columns"][col]
        if "crs" in col_metadata:
            crs = col_metadata["crs"]
            if isinstance(crs, dict):
                _remove_id_from_member_of_ensembles(crs)
        else:
            # per the GeoParquet spec, missing CRS is to be interpreted as
            # OGC:CRS84
            crs = "OGC:CRS84"

        if col_metadata["encoding"] == "WKB":
            geom_arr = from_wkb(np.array(table[col]), crs=crs)
        else:
            from geopandas.io._geoarrow import construct_shapely_array

            geom_arr = from_shapely(
                construct_shapely_array(
                    table[col].combine_chunks(), "geoarrow." + col_metadata["encoding"]
                ),
                crs=crs,
            )

        # re-insert the geometry column at its original position
        df.insert(result_column_names.index(col), col, geom_arr)

    return GeoDataFrame(df, geometry=geometry)
|
||||
|
||||
|
||||
def _get_filesystem_path(path, filesystem=None, storage_options=None):
    """
    Get the filesystem and path for a given filesystem and path.

    If the filesystem is not None then it's just returned as is.

    Parameters
    ----------
    path : str or path object
    filesystem : pyarrow or fsspec filesystem, optional
        When given, it is returned unchanged together with ``path``.
    storage_options : dict, optional
        fsspec connection options; only valid for fsspec-style URLs.

    Returns
    -------
    tuple of (filesystem or None, path)

    Raises
    ------
    ValueError
        When 'storage_options' are given with a non-fsspec path.
    """
    import pyarrow

    if (
        isinstance(path, str)
        and storage_options is None
        and filesystem is None
        and Version(pyarrow.__version__) >= Version("5.0.0")
    ):
        # Use the native pyarrow filesystem if possible.
        try:
            from pyarrow.fs import FileSystem

            filesystem, path = FileSystem.from_uri(path)
        except Exception:
            # fallback to use get_handle / fsspec for filesystems
            # that pyarrow doesn't support
            pass

    if _is_fsspec_url(path) and filesystem is None:
        fsspec = import_optional_dependency(
            # BUG FIX: the message previously misspelled "required" as "requred"
            "fsspec", extra="fsspec is required for 'storage_options'."
        )
        filesystem, path = fsspec.core.url_to_fs(path, **(storage_options or {}))

    if filesystem is None and storage_options:
        raise ValueError(
            "Cannot provide 'storage_options' with non-fsspec path '{}'".format(path)
        )

    return filesystem, path
|
||||
|
||||
|
||||
def _ensure_arrow_fs(filesystem):
    """
    Simplified version of pyarrow.fs._ensure_filesystem. This is only needed
    below because `pyarrow.parquet.read_metadata` does not yet accept a
    filesystem keyword (https://issues.apache.org/jira/browse/ARROW-16719)
    """
    from pyarrow import fs

    if isinstance(filesystem, fs.FileSystem):
        return filesystem

    # wrap fsspec-compatible filesystems; anything else passes through as-is
    try:
        import fsspec
    except ImportError:
        return filesystem

    if isinstance(filesystem, fsspec.AbstractFileSystem):
        return fs.PyFileSystem(fs.FSSpecHandler(filesystem))
    return filesystem
|
||||
|
||||
|
||||
def _validate_and_decode_metadata(metadata):
    """Decode and validate the ``b"geo"`` entry of Arrow schema metadata.

    Parameters
    ----------
    metadata : dict or None
        Raw schema metadata (bytes keys and values) as read from the file.

    Returns
    -------
    dict
        The decoded, validated 'geo' metadata.

    Raises
    ------
    ValueError
        When the 'geo' entry is absent or malformed.
    """
    if metadata is None or b"geo" not in metadata:
        raise ValueError(
            """Missing geo metadata in Parquet/Feather file.
            Use pandas.read_parquet/read_feather() instead."""
        )

    # check for malformed metadata
    try:
        decoded = _decode_metadata(metadata.get(b"geo", b""))
    except (TypeError, json.decoder.JSONDecodeError):
        raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")

    _validate_geo_metadata(decoded)
    return decoded
|
||||
|
||||
|
||||
def _read_parquet_schema_and_metadata(path, filesystem):
    """
    Opening the Parquet file/dataset a first time to get the schema and metadata.

    TODO: we should look into how we can reuse opened dataset for reading the
    actual data, to avoid discovering the dataset twice (problem right now is
    that the ParquetDataset interface doesn't allow passing the filters on read)

    Returns
    -------
    tuple of (pyarrow.Schema, dict or None)
        The schema and the raw (bytes-keyed) key/value metadata.
    """
    import pyarrow
    from pyarrow import parquet

    kwargs = {}
    if Version(pyarrow.__version__) < Version("15.0.0"):
        # 'use_legacy_dataset' was removed in pyarrow 15; only pass it on
        # older versions
        kwargs = dict(use_legacy_dataset=False)

    try:
        schema = parquet.ParquetDataset(path, filesystem=filesystem, **kwargs).schema
    except Exception:
        # fall back to single-file schema reading when dataset discovery fails
        schema = parquet.read_schema(path, filesystem=filesystem)

    metadata = schema.metadata

    # read metadata separately to get the raw Parquet FileMetaData metadata
    # (pyarrow doesn't properly exposes those in schema.metadata for files
    # created by GDAL - https://issues.apache.org/jira/browse/ARROW-16688)
    if metadata is None or b"geo" not in metadata:
        try:
            metadata = parquet.read_metadata(path, filesystem=filesystem).metadata
        except Exception:
            # best-effort only; callers validate the metadata afterwards
            pass

    return schema, metadata
|
||||
|
||||
|
||||
def _read_parquet(path, columns=None, storage_options=None, bbox=None, **kwargs):
    """
    Load a Parquet object from the file path, returning a GeoDataFrame.

    You can read a subset of columns in the file using the ``columns`` parameter.
    However, the structure of the returned GeoDataFrame will depend on which
    columns you read:

    * if no geometry columns are read, this will raise a ``ValueError`` - you
      should use the pandas `read_parquet` method instead.
    * if the primary geometry column saved to this file is not included in
      columns, the first available geometry column will be set as the geometry
      column of the returned GeoDataFrame.

    Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
    specification at: https://github.com/opengeospatial/geoparquet

    If 'crs' key is not present in the GeoParquet metadata associated with the
    Parquet object, it will default to "OGC:CRS84" according to the specification.

    Requires 'pyarrow'.

    .. versionadded:: 0.8

    Parameters
    ----------
    path : str, path object
    columns : list-like of strings, default=None
        If not None, only these columns will be read from the file. If
        the primary geometry column is not included, the first secondary
        geometry read from the file will be set as the geometry column
        of the returned GeoDataFrame. If no geometry columns are present,
        a ``ValueError`` will be raised.
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g. host,
        port, username, password, etc. For HTTP(S) URLs the key-value pairs are
        forwarded to urllib as header options. For other URLs (e.g. starting with
        "s3://", and "gcs://") the key-value pairs are forwarded to fsspec. Please
        see fsspec and urllib for more details.

        When no storage options are provided and a filesystem is implemented by
        both ``pyarrow.fs`` and ``fsspec`` (e.g. "s3://") then the ``pyarrow.fs``
        filesystem is preferred. Provide the instantiated fsspec filesystem using
        the ``filesystem`` keyword if you wish to use its implementation.
    bbox : tuple, optional
        Bounding box to be used to filter selection from geoparquet data. This
        is only usable if the data was saved with the bbox covering metadata.
        Input is of the tuple format (xmin, ymin, xmax, ymax).

    **kwargs
        Any additional kwargs passed to :func:`pyarrow.parquet.read_table`.

    Returns
    -------
    GeoDataFrame

    Examples
    --------
    >>> df = geopandas.read_parquet("data.parquet")  # doctest: +SKIP

    Specifying columns to read:

    >>> df = geopandas.read_parquet(
    ...     "data.parquet",
    ...     columns=["geometry", "pop_est"]
    ... )  # doctest: +SKIP
    """

    parquet = import_optional_dependency(
        "pyarrow.parquet", extra="pyarrow is required for Parquet support."
    )
    import geopandas.io._pyarrow_hotfix  # noqa: F401

    # TODO(https://github.com/pandas-dev/pandas/pull/41194): see if pandas
    # adds filesystem as a keyword and match that.
    filesystem = kwargs.pop("filesystem", None)
    filesystem, path = _get_filesystem_path(
        path, filesystem=filesystem, storage_options=storage_options
    )
    path = _expand_user(path)
    # discover schema/metadata first; needed for bbox filtering and to
    # decide which columns to read
    schema, metadata = _read_parquet_schema_and_metadata(path, filesystem)

    geo_metadata = _validate_and_decode_metadata(metadata)

    bbox_filter = (
        _get_parquet_bbox_filter(geo_metadata, bbox) if bbox is not None else None
    )

    if_bbox_column_exists = _check_if_covering_in_geo_metadata(geo_metadata)

    # by default, bbox column is not read in, so must specify which
    # columns are read in if it exists.
    if not columns and if_bbox_column_exists:
        columns = _get_non_bbox_columns(schema, geo_metadata)

    # if both bbox and filters kwargs are used, must splice together.
    if "filters" in kwargs:
        filters_kwarg = kwargs.pop("filters")
        filters = _splice_bbox_and_filters(filters_kwarg, bbox_filter)
    else:
        filters = bbox_filter

    kwargs["use_pandas_metadata"] = True

    table = parquet.read_table(
        path, columns=columns, filesystem=filesystem, filters=filters, **kwargs
    )

    return _arrow_to_geopandas(table, geo_metadata)
|
||||
|
||||
|
||||
def _read_feather(path, columns=None, **kwargs):
    """
    Load a Feather object from the file path, returning a GeoDataFrame.

    You can read a subset of columns in the file using the ``columns`` parameter.
    However, the structure of the returned GeoDataFrame will depend on which
    columns you read:

    * if no geometry columns are read, this will raise a ``ValueError`` - you
      should use the pandas `read_feather` method instead.
    * if the primary geometry column saved to this file is not included in
      columns, the first available geometry column will be set as the geometry
      column of the returned GeoDataFrame.

    Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
    specification at: https://github.com/opengeospatial/geoparquet

    If 'crs' key is not present in the Feather metadata associated with the
    Feather object, it will default to "OGC:CRS84" according to the specification.

    Requires 'pyarrow' >= 0.17.

    .. versionadded:: 0.8

    Parameters
    ----------
    path : str, path object
    columns : list-like of strings, default=None
        If not None, only these columns will be read from the file. If
        the primary geometry column is not included, the first secondary
        geometry read from the file will be set as the geometry column
        of the returned GeoDataFrame. If no geometry columns are present,
        a ``ValueError`` will be raised.
    **kwargs
        Any additional kwargs passed to pyarrow.feather.read_table().

    Returns
    -------
    GeoDataFrame

    Examples
    --------
    >>> df = geopandas.read_feather("data.feather")  # doctest: +SKIP

    Specifying columns to read:

    >>> df = geopandas.read_feather(
    ...     "data.feather",
    ...     columns=["geometry", "pop_est"]
    ... )  # doctest: +SKIP
    """

    feather = import_optional_dependency(
        "pyarrow.feather", extra="pyarrow is required for Feather support."
    )
    # TODO move this into `import_optional_dependency`
    import pyarrow

    import geopandas.io._pyarrow_hotfix  # noqa: F401

    if Version(pyarrow.__version__) < Version("0.17.0"):
        raise ImportError("pyarrow >= 0.17 required for Feather support")

    path = _expand_user(path)

    table = feather.read_table(path, columns=columns, **kwargs)
    # metadata is decoded from the table schema inside _arrow_to_geopandas
    return _arrow_to_geopandas(table)
|
||||
|
||||
|
||||
def _get_parquet_bbox_filter(geo_metadata, bbox):
    """Build a pyarrow filter expression selecting rows intersecting ``bbox``.

    Uses the bbox covering column when the metadata declares one, otherwise
    falls back to filtering a 'point'-encoded primary geometry directly.
    Raises ValueError when neither option is available.
    """
    primary_column = geo_metadata["primary_column"]

    if _check_if_covering_in_geo_metadata(geo_metadata):
        bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
        return _convert_bbox_to_parquet_filter(bbox, bbox_column_name)

    if geo_metadata["columns"][primary_column]["encoding"] == "point":
        import pyarrow.compute as pc

        x = pc.field((primary_column, "x"))
        y = pc.field((primary_column, "y"))
        return (x >= bbox[0]) & (x <= bbox[2]) & (y >= bbox[1]) & (y <= bbox[3])

    raise ValueError(
        "Specifying 'bbox' not supported for this Parquet file (it should either "
        "have a bbox covering column or use 'point' encoding)."
    )
|
||||
|
||||
|
||||
def _convert_bbox_to_parquet_filter(bbox, bbox_column_name):
    """Build a pyarrow expression keeping rows whose covering box may
    intersect ``bbox`` (xmin, ymin, xmax, ymax)."""
    import pyarrow.compute as pc

    xmin = pc.field((bbox_column_name, "xmin"))
    ymin = pc.field((bbox_column_name, "ymin"))
    xmax = pc.field((bbox_column_name, "xmax"))
    ymax = pc.field((bbox_column_name, "ymax"))
    # a row is kept unless its box lies entirely outside the query box
    disjoint = (
        (xmin > bbox[2]) | (ymin > bbox[3]) | (xmax < bbox[0]) | (ymax < bbox[1])
    )
    return ~disjoint
|
||||
|
||||
|
||||
def _check_if_covering_in_geo_metadata(geo_metadata):
|
||||
primary_column = geo_metadata["primary_column"]
|
||||
return "covering" in geo_metadata["columns"][primary_column].keys()
|
||||
|
||||
|
||||
def _get_bbox_encoding_column_name(geo_metadata):
|
||||
primary_column = geo_metadata["primary_column"]
|
||||
return geo_metadata["columns"][primary_column]["covering"]["bbox"]["xmin"][0]
|
||||
|
||||
|
||||
def _get_non_bbox_columns(schema, geo_metadata):
    """Return the schema's column names with the bbox covering column removed."""
    bbox_col = _get_bbox_encoding_column_name(geo_metadata)
    # schema.names returns a fresh list, so removing in place is safe here
    names = schema.names
    try:
        names.remove(bbox_col)
    except ValueError:
        pass
    return names
|
||||
|
||||
|
||||
def _splice_bbox_and_filters(kwarg_filters, bbox_filter):
    """Combine a user-supplied 'filters' kwarg with the bbox filter.

    Parameters
    ----------
    kwarg_filters : list or pyarrow expression
        Filters in any form accepted by ``pyarrow.parquet.read_table``.
    bbox_filter : pyarrow expression or None

    Returns
    -------
    ``kwarg_filters`` unchanged when there is no bbox filter, otherwise a
    single pyarrow expression AND-ing both.
    """
    # Early return before touching pyarrow: the original imported
    # pyarrow.parquet even on this path where it was never used.
    if bbox_filter is None:
        return kwarg_filters

    parquet = import_optional_dependency(
        "pyarrow.parquet", extra="pyarrow is required for Parquet support."
    )
    # normalize list-of-tuples filters into an expression so '&' works
    filters_expression = parquet.filters_to_expression(kwarg_filters)
    return bbox_filter & filters_expression
|
||||
@@ -1,851 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import urllib.request
|
||||
import warnings
|
||||
from io import IOBase
|
||||
from packaging.version import Version
|
||||
from pathlib import Path
|
||||
|
||||
# Adapted from pandas.io.common
|
||||
from urllib.parse import urlparse as parse_url
|
||||
from urllib.parse import uses_netloc, uses_params, uses_relative
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_integer_dtype
|
||||
|
||||
import shapely
|
||||
from shapely.geometry import mapping
|
||||
from shapely.geometry.base import BaseGeometry
|
||||
|
||||
from geopandas import GeoDataFrame, GeoSeries
|
||||
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20
|
||||
from geopandas.io.util import vsi_path
|
||||
|
||||
# URL schemes that should be fetched via urllib before handing data to the
# IO engine (used by _is_url).
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
# file:// URIs are supported by fiona/pyogrio -> don't already open + read the file here
_VALID_URLS.discard("file")
|
||||
|
||||
# Lazy-import state for the optional 'fiona' engine (see _import_fiona):
# the module object once imported, False if the import failed, None while
# not yet attempted.
fiona = None
# fiona's environment context manager (Env, or 'drivers' on old versions)
fiona_env = None
# message from a failed 'import fiona', reported by _check_fiona
fiona_import_error = None
# True when the imported fiona is >= 1.9 (set by _import_fiona)
FIONA_GE_19 = False
|
||||
|
||||
|
||||
def _import_fiona():
    """Lazily import fiona, caching the result in module globals.

    On success, sets ``fiona`` to the module, ``fiona_env`` to its
    environment context manager, and ``FIONA_GE_19`` to the version flag.
    On failure, sets ``fiona`` to False and records the import error
    message so ``_check_fiona`` can report it.
    """
    global fiona
    global fiona_env
    global fiona_import_error
    global FIONA_GE_19

    if fiona is None:
        try:
            import fiona

            # only try to import fiona.Env if the main fiona import succeeded
            # (otherwise you can get confusing "AttributeError: module 'fiona'
            # has no attribute '_loading'" / partially initialized module errors)
            try:
                from fiona import Env as fiona_env
            except ImportError:
                # fall back to the older name for the env context manager
                try:
                    from fiona import drivers as fiona_env
                except ImportError:
                    fiona_env = None

            # compare on base_version to ignore pre-release/dev suffixes
            FIONA_GE_19 = Version(Version(fiona.__version__).base_version) >= Version(
                "1.9.0"
            )

        except ImportError as err:
            fiona = False
            fiona_import_error = str(err)
|
||||
|
||||
|
||||
# Lazy-import state for the optional 'pyogrio' engine (see _import_pyogrio):
# the module object once imported, False if the import failed, None while
# not yet attempted.
pyogrio = None
pyogrio_import_error = None
|
||||
|
||||
|
||||
def _import_pyogrio():
    """Lazily import pyogrio, caching the module (or failure) in globals.

    On failure, sets ``pyogrio`` to False and records the import error
    message so ``_check_pyogrio`` can report it.
    """
    global pyogrio
    global pyogrio_import_error

    if pyogrio is None:
        try:
            import pyogrio

        except ImportError as err:
            pyogrio = False
            pyogrio_import_error = str(err)
|
||||
|
||||
|
||||
def _check_fiona(func):
    """Raise ImportError when fiona is unavailable (or not yet imported).

    ``func`` is a human-readable description of the calling API, used in
    the error message.
    """
    if fiona:
        return
    raise ImportError(
        f"the {func} requires the 'fiona' package, but it is not installed or does "
        f"not import correctly.\nImporting fiona resulted in: {fiona_import_error}"
    )
|
||||
|
||||
|
||||
def _check_pyogrio(func):
    """Raise ImportError when pyogrio is unavailable (or not yet imported).

    Parameters
    ----------
    func : str
        Human-readable description of the calling API, used in the message.
    """
    if not pyogrio:
        raise ImportError(
            f"the {func} requires the 'pyogrio' package, but it is not installed "
            "or does not import correctly."
            # BUG FIX: this line was missing the f-prefix, so users saw the
            # literal "{pyogrio_import_error}" instead of the actual error
            # (compare _check_fiona, which interpolates correctly).
            f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
        )
|
||||
|
||||
|
||||
def _check_metadata_supported(metadata: str | None, engine: str, driver: str) -> None:
|
||||
if metadata is None:
|
||||
return
|
||||
if driver != "GPKG":
|
||||
raise NotImplementedError(
|
||||
"The 'metadata' keyword is only supported for the GPKG driver."
|
||||
)
|
||||
|
||||
if engine == "fiona" and not FIONA_GE_19:
|
||||
raise NotImplementedError(
|
||||
"The 'metadata' keyword is only supported for Fiona >= 1.9."
|
||||
)
|
||||
|
||||
|
||||
def _check_engine(engine, func):
    """Resolve and validate the IO engine for reading/writing files.

    Resolution order: explicit ``engine`` argument, then the global
    ``geopandas.options.io_engine``, then "pyogrio" if importable,
    then "fiona". Raises ImportError (via the _check_* helpers, or
    directly when neither engine imports) with the recorded import errors.

    Parameters
    ----------
    engine : str or None
        "pyogrio", "fiona", or None to auto-select.
    func : str
        Human-readable description of the calling API, for error messages.

    Returns
    -------
    str
        The resolved engine name.
    """
    # if not specified through keyword or option, then default to "pyogrio" if
    # installed, otherwise try fiona
    if engine is None:
        import geopandas

        engine = geopandas.options.io_engine

    if engine is None:
        _import_pyogrio()
        if pyogrio:
            engine = "pyogrio"
        else:
            _import_fiona()
            if fiona:
                engine = "fiona"

    if engine == "pyogrio":
        _import_pyogrio()
        _check_pyogrio(func)
    elif engine == "fiona":
        _import_fiona()
        _check_fiona(func)
    elif engine is None:
        # still None: neither engine could be imported above
        raise ImportError(
            f"The {func} requires the 'pyogrio' or 'fiona' package, "
            "but neither is installed or imports correctly."
            f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
            f"\nImporting fiona resulted in: {fiona_import_error}"
        )

    return engine
|
||||
|
||||
|
||||
# Mapping of lowercase file extensions to the OGR driver name, used to
# infer the driver when one is not given explicitly.
_EXTENSION_TO_DRIVER = {
    ".bna": "BNA",
    ".dxf": "DXF",
    ".csv": "CSV",
    ".shp": "ESRI Shapefile",
    ".dbf": "ESRI Shapefile",
    ".json": "GeoJSON",
    ".geojson": "GeoJSON",
    ".geojsonl": "GeoJSONSeq",
    ".geojsons": "GeoJSONSeq",
    ".gpkg": "GPKG",
    ".gml": "GML",
    ".xml": "GML",
    ".gpx": "GPX",
    ".gtm": "GPSTrackMaker",
    ".gtz": "GPSTrackMaker",
    ".tab": "MapInfo File",
    ".mif": "MapInfo File",
    ".mid": "MapInfo File",
    ".dgn": "DGN",
    ".fgb": "FlatGeobuf",
}
|
||||
|
||||
|
||||
def _expand_user(path):
|
||||
"""Expand paths that use ~."""
|
||||
if isinstance(path, str):
|
||||
path = os.path.expanduser(path)
|
||||
elif isinstance(path, Path):
|
||||
path = path.expanduser()
|
||||
return path
|
||||
|
||||
|
||||
def _is_url(url):
    """Check to see if *url* has a valid protocol."""
    try:
        scheme = parse_url(url).scheme
    except Exception:
        # unparseable input (e.g. non-string) is simply not a URL
        return False
    return scheme in _VALID_URLS
|
||||
|
||||
|
||||
def _read_file(
    filename, bbox=None, mask=None, columns=None, rows=None, engine=None, **kwargs
):
    """
    Returns a GeoDataFrame from a file or URL.

    Parameters
    ----------
    filename : str, path object or file-like object
        Either the absolute or relative path to the file or URL to
        be opened, or any object with a read() method (such as an open file
        or StringIO)
    bbox : tuple | GeoDataFrame or GeoSeries | shapely Geometry, default None
        Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely
        geometry. With engine="fiona", CRS mis-matches are resolved if given a GeoSeries
        or GeoDataFrame. With engine="pyogrio", bbox must be in the same CRS as the
        dataset. Tuple is (minx, miny, maxx, maxy) to match the bounds property of
        shapely geometry objects. Cannot be used with mask.
    mask : dict | GeoDataFrame or GeoSeries | shapely Geometry, default None
        Filter for features that intersect with the given dict-like geojson
        geometry, GeoSeries, GeoDataFrame or shapely geometry.
        CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
        Cannot be used with bbox. If multiple geometries are passed, this will
        first union all geometries, which may be computationally expensive.
    columns : list, optional
        List of column names to import from the data source. Column names
        must exactly match the names in the data source. To avoid reading
        any columns (besides the geometry column), pass an empty list-like.
        By default reads all columns.
    rows : int or slice, default None
        Load in specific rows by passing an integer (first `n` rows) or a
        slice() object.
    engine : str, "pyogrio" or "fiona"
        The underlying library that is used to read the file. Currently, the
        supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
        installed, otherwise tries "fiona". Engine can also be set globally
        with the ``geopandas.options.io_engine`` option.
    **kwargs :
        Keyword args to be passed to the engine, and can be used to read
        from multi-layer data, data stored within archives (zip files), etc.
        In case of the "pyogrio" engine, the keyword arguments are passed to
        `pyogrio.read_dataframe`. In case of the "fiona" engine, the keyword
        arguments are passed to ``fiona.open``. For more information on possible
        keywords, type: ``import pyogrio; help(pyogrio.read_dataframe)``.


    Examples
    --------
    >>> df = geopandas.read_file("nybb.shp")  # doctest: +SKIP

    Specifying layer of GPKG:

    >>> df = geopandas.read_file("file.gpkg", layer='cities')  # doctest: +SKIP

    Reading only first 10 rows:

    >>> df = geopandas.read_file("nybb.shp", rows=10)  # doctest: +SKIP

    Reading only geometries intersecting ``mask``:

    >>> df = geopandas.read_file("nybb.shp", mask=polygon)  # doctest: +SKIP

    Reading only geometries intersecting ``bbox``:

    >>> df = geopandas.read_file("nybb.shp", bbox=(0, 0, 10, 20))  # doctest: +SKIP

    Returns
    -------
    :obj:`geopandas.GeoDataFrame` or :obj:`pandas.DataFrame` :
        If `ignore_geometry=True` a :obj:`pandas.DataFrame` will be returned.

    Notes
    -----
    The format drivers will attempt to detect the encoding of your data, but
    may fail. In this case, the proper encoding can be specified explicitly
    by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.

    When specifying a URL, geopandas will check if the server supports reading
    partial data and in that case pass the URL as is to the underlying engine,
    which will then use the network file system handler of GDAL to read from
    the URL. Otherwise geopandas will download the data from the URL and pass
    all data in-memory to the underlying engine.
    If you need more control over how the URL is read, you can specify the
    GDAL virtual filesystem manually (e.g. ``/vsicurl/https://...``). See the
    GDAL documentation on filesystems for more details
    (https://gdal.org/user/virtual_file_systems.html#vsicurl-http-https-ftp-files-random-access).

    """
    engine = _check_engine(engine, "'read_file' function")

    filename = _expand_user(filename)

    from_bytes = False
    if _is_url(filename):
        # if it is a url that supports random access -> pass through to
        # pyogrio/fiona as is (to support downloading only part of the file)
        # otherwise still download manually because pyogrio/fiona don't support
        # all types of urls (https://github.com/geopandas/geopandas/issues/2908)
        with urllib.request.urlopen(filename) as response:
            if not response.headers.get("Accept-Ranges") == "bytes":
                # server cannot serve byte ranges -> download whole payload
                filename = response.read()
                from_bytes = True

    if engine == "pyogrio":
        return _read_file_pyogrio(
            filename, bbox=bbox, mask=mask, columns=columns, rows=rows, **kwargs
        )

    elif engine == "fiona":
        if pd.api.types.is_file_like(filename):
            # drain file-like objects up front; fiona works on raw bytes
            data = filename.read()
            path_or_bytes = data.encode("utf-8") if isinstance(data, str) else data
            from_bytes = True
        else:
            path_or_bytes = filename

        return _read_file_fiona(
            path_or_bytes,
            from_bytes,
            bbox=bbox,
            mask=mask,
            columns=columns,
            rows=rows,
            **kwargs,
        )

    else:
        raise ValueError(f"unknown engine '{engine}'")
|
||||
|
||||
|
||||
def _read_file_fiona(
    path_or_bytes,
    from_bytes,
    bbox=None,
    mask=None,
    columns=None,
    rows=None,
    where=None,
    **kwargs,
):
    """Fiona-backed implementation of :func:`_read_file`.

    Parameters mirror ``_read_file``; ``from_bytes`` indicates that
    ``path_or_bytes`` holds raw bytes (a downloaded URL or drained file-like
    payload) rather than a path. Returns a GeoDataFrame, or a plain
    DataFrame when the geometry is ignored or the source has none.
    """
    if where is not None and not FIONA_GE_19:
        raise NotImplementedError("where requires fiona 1.9+")

    if columns is not None:
        if "include_fields" in kwargs:
            raise ValueError(
                "Cannot specify both 'include_fields' and 'columns' keywords"
            )
        if not FIONA_GE_19:
            raise NotImplementedError("'columns' keyword requires fiona 1.9+")
        # fiona's name for the column selection keyword
        kwargs["include_fields"] = columns
    elif "include_fields" in kwargs:
        # alias to columns, as this variable is used below to specify column order
        # in the dataframe creation
        columns = kwargs["include_fields"]

    if not from_bytes:
        # Opening a file via URL or file-like-object above automatically detects a
        # zipped file. In order to match that behavior, attempt to add a zip scheme
        # if missing.
        path_or_bytes = vsi_path(str(path_or_bytes))

    # raw bytes go through fiona's in-memory collection reader
    if from_bytes:
        reader = fiona.BytesCollection
    else:
        reader = fiona.open

    with fiona_env():
        with reader(path_or_bytes, **kwargs) as features:
            crs = features.crs_wkt
            # attempt to get EPSG code
            try:
                # fiona 1.9+
                epsg = features.crs.to_epsg(confidence_threshold=100)
                if epsg is not None:
                    crs = epsg
            except AttributeError:
                # fiona <= 1.8
                try:
                    crs = features.crs["init"]
                except (TypeError, KeyError):
                    pass

            # handle loading the bounding box
            if bbox is not None:
                if isinstance(bbox, (GeoDataFrame, GeoSeries)):
                    # reproject to the dataset CRS before taking bounds
                    bbox = tuple(bbox.to_crs(crs).total_bounds)
                elif isinstance(bbox, BaseGeometry):
                    bbox = bbox.bounds
                assert len(bbox) == 4
            # handle loading the mask
            elif isinstance(mask, (GeoDataFrame, GeoSeries)):
                mask = mapping(mask.to_crs(crs).union_all())
            elif isinstance(mask, BaseGeometry):
                mask = mapping(mask)

            filters = {}
            if bbox is not None:
                filters["bbox"] = bbox
            if mask is not None:
                filters["mask"] = mask
            if where is not None:
                filters["where"] = where

            # setup the data loading filter
            if rows is not None:
                if isinstance(rows, int):
                    rows = slice(rows)
                elif not isinstance(rows, slice):
                    raise TypeError("'rows' must be an integer or a slice.")
                f_filt = features.filter(rows.start, rows.stop, rows.step, **filters)
            elif filters:
                f_filt = features.filter(**filters)
            else:
                f_filt = features
            # get list of columns
            columns = columns or list(features.schema["properties"])
            datetime_fields = [
                k for (k, v) in features.schema["properties"].items() if v == "datetime"
            ]
            if (
                kwargs.get("ignore_geometry", False)
                or features.schema["geometry"] == "None"
            ):
                # no geometry to attach -> plain DataFrame of properties
                df = pd.DataFrame(
                    [record["properties"] for record in f_filt], columns=columns
                )
            else:
                df = GeoDataFrame.from_features(
                    f_filt, crs=crs, columns=columns + ["geometry"]
                )
            for k in datetime_fields:
                as_dt = None
                # plain try catch for when pandas will raise in the future
                # TODO we can tighten the exception type in future when it does
                try:
                    with warnings.catch_warnings():
                        # pandas 2.x does not yet enforce this behaviour but raises a
                        # warning -> we want to suppress this warning for our users,
                        # and do this by turning it into an error so we take the
                        # `except` code path to try again with utc=True
                        warnings.filterwarnings(
                            "error",
                            "In a future version of pandas, parsing datetimes with "
                            "mixed time zones will raise an error",
                            FutureWarning,
                        )
                        as_dt = pd.to_datetime(df[k])
                except Exception:
                    pass
                if as_dt is None or as_dt.dtype == "object":
                    # if to_datetime failed, try again for mixed timezone offsets
                    # This can still fail if there are invalid datetimes
                    try:
                        as_dt = pd.to_datetime(df[k], utc=True)
                    except Exception:
                        pass
                # if to_datetime succeeded, round datetimes as
                # fiona only supports up to ms precision (any microseconds are
                # floating point rounding error)
                if as_dt is not None and not (as_dt.dtype == "object"):
                    if PANDAS_GE_20:
                        df[k] = as_dt.dt.as_unit("ms")
                    else:
                        df[k] = as_dt.dt.round(freq="ms")
            return df
|
||||
|
||||
|
||||
def _read_file_pyogrio(path_or_bytes, bbox=None, mask=None, rows=None, **kwargs):
    """Pyogrio-backed implementation of :func:`_read_file`.

    Translates geopandas/fiona-style keywords (``rows``, ``bbox``, ``mask``,
    ``ignore_geometry``, ``ignore_fields``/``include_fields``) into the
    keyword arguments understood by :func:`pyogrio.read_dataframe`.
    """
    import pyogrio

    if rows is not None:
        if isinstance(rows, int):
            # first n rows
            kwargs["max_features"] = rows
        elif isinstance(rows, slice):
            if rows.start is not None:
                if rows.start < 0:
                    raise ValueError(
                        "Negative slice start not supported with the 'pyogrio' engine."
                    )
                kwargs["skip_features"] = rows.start
            if rows.stop is not None:
                # pyogrio counts max_features from the skip point
                kwargs["max_features"] = rows.stop - (rows.start or 0)
            if rows.step is not None:
                raise ValueError("slice with step is not supported")
        else:
            raise TypeError("'rows' must be an integer or a slice.")

    if bbox is not None and mask is not None:
        # match error message from Fiona
        raise ValueError("mask and bbox can not be set together")

    if bbox is not None:
        if isinstance(bbox, (GeoDataFrame, GeoSeries)):
            crs = pyogrio.read_info(path_or_bytes).get("crs")
            # read_info consumed the stream; rewind for the actual read
            if isinstance(path_or_bytes, IOBase):
                path_or_bytes.seek(0)

            bbox = tuple(bbox.to_crs(crs).total_bounds)
        elif isinstance(bbox, BaseGeometry):
            bbox = bbox.bounds
        if len(bbox) != 4:
            raise ValueError("'bbox' should be a length-4 tuple.")

    if mask is not None:
        # NOTE: mask cannot be used at same time as bbox keyword
        if isinstance(mask, (GeoDataFrame, GeoSeries)):
            crs = pyogrio.read_info(path_or_bytes).get("crs")
            # read_info consumed the stream; rewind for the actual read
            if isinstance(path_or_bytes, IOBase):
                path_or_bytes.seek(0)

            mask = shapely.unary_union(mask.to_crs(crs).geometry.values)
        elif isinstance(mask, BaseGeometry):
            mask = shapely.unary_union(mask)
        elif isinstance(mask, dict) or hasattr(mask, "__geo_interface__"):
            # convert GeoJSON to shapely geometry
            mask = shapely.geometry.shape(mask)

        kwargs["mask"] = mask

    if kwargs.pop("ignore_geometry", False):
        kwargs["read_geometry"] = False

    # translate `ignore_fields`/`include_fields` keyword for back compat with fiona
    if "ignore_fields" in kwargs and "include_fields" in kwargs:
        raise ValueError("Cannot specify both 'ignore_fields' and 'include_fields'")
    elif "ignore_fields" in kwargs:
        if kwargs.get("columns", None) is not None:
            raise ValueError(
                "Cannot specify both 'columns' and 'ignore_fields' keywords"
            )
        warnings.warn(
            "The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
            "will be removed in a future release. You can use the 'columns' keyword "
            "instead to select which columns to read.",
            DeprecationWarning,
            stacklevel=3,
        )
        # invert the exclusion list into pyogrio's inclusion list
        ignore_fields = kwargs.pop("ignore_fields")
        fields = pyogrio.read_info(path_or_bytes)["fields"]
        include_fields = [col for col in fields if col not in ignore_fields]
        kwargs["columns"] = include_fields
    elif "include_fields" in kwargs:
        # translate `include_fields` keyword for back compat with fiona engine
        if kwargs.get("columns", None) is not None:
            raise ValueError(
                "Cannot specify both 'columns' and 'include_fields' keywords"
            )
        warnings.warn(
            "The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
            "will be removed in a future release. You can use the 'columns' keyword "
            "instead to select which columns to read.",
            DeprecationWarning,
            stacklevel=3,
        )
        kwargs["columns"] = kwargs.pop("include_fields")

    return pyogrio.read_dataframe(path_or_bytes, bbox=bbox, **kwargs)
|
||||
|
||||
|
||||
def _detect_driver(path):
    """Infer the OGR driver name from *path*'s file extension.

    File handles are unwrapped via their ``name`` attribute first. Unknown
    or missing extensions fall back to "ESRI Shapefile" (folder output).
    """
    # file handles expose the underlying path via .name
    path = getattr(path, "name", path)
    extension = Path(path).suffix.lower()
    driver = _EXTENSION_TO_DRIVER.get(extension)
    if driver is None:
        # Assume a shapefile folder for now. This may become an error once
        # the expected folder-writing behavior is more clearly defined.
        driver = "ESRI Shapefile"
    return driver
|
||||
|
||||
|
||||
def _to_file(
    df,
    filename,
    driver=None,
    schema=None,
    index=None,
    mode="w",
    crs=None,
    engine=None,
    metadata=None,
    **kwargs,
):
    """
    Write this GeoDataFrame to an OGR data source

    A dictionary of supported OGR providers is available via:

    >>> import pyogrio
    >>> pyogrio.list_drivers()  # doctest: +SKIP

    Parameters
    ----------
    df : GeoDataFrame to be written
    filename : string
        File path or file handle to write to. The path may specify a
        GDAL VSI scheme.
    driver : string, default None
        The OGR format driver used to write the vector file.
        If not specified, it attempts to infer it from the file extension.
        If no extension is specified, it saves ESRI Shapefile to a folder.
    schema : dict, default None
        If specified, the schema dictionary is passed to Fiona to
        better control how the file is written. If None, GeoPandas
        will determine the schema based on each column's dtype.
        Not supported for the "pyogrio" engine.
    index : bool, default None
        If True, write index into one or more columns (for MultiIndex).
        Default None writes the index into one or more columns only if
        the index is named, is a MultiIndex, or has a non-integer data
        type. If False, no index is written.

        .. versionadded:: 0.7
            Previously the index was not written.
    mode : string, default 'w'
        The write mode, 'w' to overwrite the existing file and 'a' to append;
        when using the pyogrio engine, you can also pass ``append=True``.
        Not all drivers support appending. For the fiona engine, the drivers
        that support appending are listed in fiona.supported_drivers or
        https://github.com/Toblerity/Fiona/blob/master/fiona/drvsupport.py.
        For the pyogrio engine, you should be able to use any driver that
        is available in your installation of GDAL that supports append
        capability; see the specific driver entry at
        https://gdal.org/drivers/vector/index.html for more information.
    crs : pyproj.CRS, default None
        If specified, the CRS is passed to Fiona to
        better control how the file is written. If None, GeoPandas
        will determine the crs based on crs df attribute.
        The value can be anything accepted
        by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
        such as an authority string (eg "EPSG:4326") or a WKT string.
    engine : str, "pyogrio" or "fiona"
        The underlying library that is used to write the file. Currently, the
        supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
        installed, otherwise tries "fiona". Engine can also be set globally
        with the ``geopandas.options.io_engine`` option.
    metadata : dict[str, str], default None
        Optional metadata to be stored in the file. Keys and values must be
        strings. Only supported for the "GPKG" driver
        (requires Fiona >= 1.9 or pyogrio >= 0.6).
    **kwargs :
        Keyword args to be passed to the engine, and can be used to write
        to multi-layer data, store data within archives (zip files), etc.
        In case of the "fiona" engine, the keyword arguments are passed to
        ``fiona.open``. For more information on possible keywords, type:
        ``import fiona; help(fiona.open)``. In case of the "pyogrio" engine,
        the keyword arguments are passed to `pyogrio.write_dataframe`.

    Notes
    -----
    The format drivers will attempt to detect the encoding of your data, but
    may fail. In this case, the proper encoding can be specified explicitly
    by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
    """
    engine = _check_engine(engine, "'to_file' method")

    filename = _expand_user(filename)

    if index is None:
        # Determine if index attribute(s) should be saved to file
        # (only if they are named or are non-integer)
        index = list(df.index.names) != [None] or not is_integer_dtype(df.index.dtype)
    if index:
        # materialize the index as regular column(s) so it gets written
        df = df.reset_index(drop=False)

    if driver is None:
        driver = _detect_driver(filename)

    if driver == "ESRI Shapefile" and any(len(c) > 10 for c in df.columns.tolist()):
        # shapefile DBF field names are limited to 10 characters
        warnings.warn(
            "Column names longer than 10 characters will be truncated when saved to "
            "ESRI Shapefile.",
            stacklevel=3,
        )

    if (df.dtypes == "geometry").sum() > 1:
        raise ValueError(
            "GeoDataFrame contains multiple geometry columns but GeoDataFrame.to_file "
            "supports only a single geometry column. Use a GeoDataFrame.to_parquet or "
            "GeoDataFrame.to_feather, drop additional geometry columns or convert them "
            "to a supported format like a well-known text (WKT) using "
            "`GeoSeries.to_wkt()`.",
        )
    _check_metadata_supported(metadata, engine, driver)

    if mode not in ("w", "a"):
        raise ValueError(f"'mode' should be one of 'w' or 'a', got '{mode}' instead")

    if engine == "pyogrio":
        _to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs)
    elif engine == "fiona":
        _to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs)
    else:
        raise ValueError(f"unknown engine '{engine}'")
|
||||
|
||||
|
||||
def _to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs):
    """Fiona-backed implementation of :func:`_to_file`.

    Infers a fiona schema when none is given, resolves the CRS to a WKT
    string (WKT2 for GDAL >= 3, WKT1_GDAL otherwise) and writes the frame's
    features via ``fiona.open(...).writerecords``.
    """
    if not HAS_PYPROJ and crs:
        raise ImportError(
            "The 'pyproj' package is required to write a file with a CRS, but it is not"
            " installed or does not import correctly."
        )

    if schema is None:
        schema = infer_schema(df)

    if crs:
        from pyproj import CRS

        crs = CRS.from_user_input(crs)
    else:
        # fall back on the frame's own CRS
        crs = df.crs

    with fiona_env():
        crs_wkt = None
        try:
            gdal_version = Version(
                fiona.env.get_gdal_release_name().strip("e")
            )  # GH3147
        except (AttributeError, ValueError):
            gdal_version = Version("2.0.0")  # just assume it is not the latest
        # GDAL >= 3 understands WKT2; older GDAL needs the WKT1_GDAL variant
        if gdal_version >= Version("3.0.0") and crs:
            crs_wkt = crs.to_wkt()
        elif crs:
            crs_wkt = crs.to_wkt("WKT1_GDAL")
        with fiona.open(
            filename, mode=mode, driver=driver, crs_wkt=crs_wkt, schema=schema, **kwargs
        ) as colxn:
            if metadata is not None:
                colxn.update_tags(metadata)
            colxn.writerecords(df.iterfeatures())
|
||||
|
||||
|
||||
def _to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs):
    """Pyogrio-backed implementation of :func:`_to_file`.

    ``schema`` and ``crs`` are fiona-only options and are rejected here;
    append mode is translated to pyogrio's ``append=True`` keyword.
    """
    import pyogrio

    if schema is not None:
        msg = "The 'schema' argument is not supported with the 'pyogrio' engine."
        raise ValueError(msg)

    if mode == "a":
        kwargs["append"] = True

    if crs is not None:
        msg = "Passing 'crs' is not supported with the 'pyogrio' engine."
        raise ValueError(msg)

    # fiona performs this check inside gdf.iterfeatures(); do it up front here
    if not df.columns.is_unique:
        raise ValueError("GeoDataFrame cannot contain duplicated column names.")

    pyogrio.write_dataframe(df, filename, driver=driver, metadata=metadata, **kwargs)
|
||||
|
||||
|
||||
def infer_schema(df):
    """Infer a fiona schema dict (geometry type + property types) from *df*.

    Every column except the active geometry column is mapped from its
    pandas/numpy dtype to a fiona type name; the geometry entry comes from
    :func:`_geometry_types`. Warns when the frame is empty, since some
    drivers cannot write empty layers.
    """
    from collections import OrderedDict

    # TODO: test pandas string type and boolean type once released
    dtype_map = {
        "Int32": "int32",
        "int32": "int32",
        "Int64": "int",
        "string": "str",
        "boolean": "bool",
    }

    def to_fiona_type(column, dtype):
        # object columns are written as strings
        if dtype == object:
            return "str"
        # any numpy datetime64 flavor, regardless of resolution
        if dtype.name.startswith("datetime64"):
            return "datetime"
        key = str(dtype)
        if key in dtype_map:
            fiona_type = dtype_map[key]
        else:
            # fall back on the Python type of a zero scalar of this dtype
            fiona_type = type(np.zeros(1, dtype).item()).__name__
        # normalize legacy "long" to "int"
        return "int" if fiona_type == "long" else fiona_type

    properties = OrderedDict(
        (name, to_fiona_type(name, dtype))
        for name, dtype in zip(df.columns, df.dtypes)
        if name != df._geometry_column_name
    )

    if df.empty:
        warnings.warn(
            "You are attempting to write an empty DataFrame to file. "
            "For some drivers, this operation may fail.",
            UserWarning,
            stacklevel=3,
        )

    # Fiona accepts a list of geometry types since
    # https://github.com/Toblerity/Fiona/issues/446 was resolved
    return {"geometry": _geometry_types(df), "properties": properties}
|
||||
|
||||
|
||||
def _geometry_types(df):
|
||||
"""
|
||||
Determine the geometry types in the GeoDataFrame for the schema.
|
||||
"""
|
||||
geom_types_2D = df[~df.geometry.has_z].geometry.geom_type.unique()
|
||||
geom_types_2D = [gtype for gtype in geom_types_2D if gtype is not None]
|
||||
geom_types_3D = df[df.geometry.has_z].geometry.geom_type.unique()
|
||||
geom_types_3D = ["3D " + gtype for gtype in geom_types_3D if gtype is not None]
|
||||
geom_types = geom_types_3D + geom_types_2D
|
||||
|
||||
if len(geom_types) == 0:
|
||||
# Default geometry type supported by Fiona
|
||||
# (Since https://github.com/Toblerity/Fiona/issues/446 resolution)
|
||||
return "Unknown"
|
||||
|
||||
if len(geom_types) == 1:
|
||||
geom_types = geom_types[0]
|
||||
|
||||
return geom_types
|
||||
|
||||
|
||||
def _list_layers(filename) -> pd.DataFrame:
    """List layers available in a file.

    Gives an overview of the layers in a file or URL together with their
    geometry types. Where the data source supports it, both spatial and
    non-spatial layers are reported; non-spatial layers carry ``None`` in the
    ``"geometry_type"`` column. GeoPandas will not read such layers, but they
    can be loaded into a pd.DataFrame using :func:`pyogrio.read_dataframe`.

    Parameters
    ----------
    filename : str, path object or file-like object
        Either the absolute or relative path to the file or URL to
        be opened, or any object with a read() method (such as an open file
        or StringIO)

    Returns
    -------
    pandas.DataFrame
        A DataFrame with columns "name" and "geometry_type" and one row per layer.
    """
    _import_pyogrio()
    _check_pyogrio("list_layers")

    import pyogrio

    layer_info = pyogrio.list_layers(filename)
    return pd.DataFrame(layer_info, columns=["name", "geometry_type"])
|
||||
@@ -1,473 +0,0 @@
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
from functools import lru_cache
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import shapely
|
||||
import shapely.wkb
|
||||
|
||||
from geopandas import GeoDataFrame
|
||||
|
||||
|
||||
@contextmanager
def _get_conn(conn_or_engine):
    """
    Yield a connection within a transaction context.

    Engines are entered with ``Engine.begin()`` (a Connection with an
    implicit Transaction); a Connection is wrapped in ``Connection.begin()``
    only when no transaction is already active. Either way the caller sees a
    Connection inside a (possibly nested) transaction.

    Parameters
    ----------
    conn_or_engine : Connection or Engine
        A sqlalchemy Connection or Engine instance
    Returns
    -------
    Connection
    """
    from sqlalchemy.engine.base import Connection, Engine

    if isinstance(conn_or_engine, Connection):
        if conn_or_engine.in_transaction():
            # already transactional -> hand it over as-is
            yield conn_or_engine
        else:
            with conn_or_engine.begin():
                yield conn_or_engine
    elif isinstance(conn_or_engine, Engine):
        with conn_or_engine.begin() as connection:
            yield connection
    else:
        raise ValueError(f"Unknown Connectable: {conn_or_engine}")
|
||||
|
||||
|
||||
def _df_to_geodf(df, geom_col="geom", crs=None, con=None):
|
||||
"""
|
||||
Transforms a pandas DataFrame into a GeoDataFrame.
|
||||
The column 'geom_col' must be a geometry column in WKB representation.
|
||||
To be used to convert df based on pd.read_sql to gdf.
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame
|
||||
pandas DataFrame with geometry column in WKB representation.
|
||||
geom_col : string, default 'geom'
|
||||
column name to convert to shapely geometries
|
||||
crs : pyproj.CRS, optional
|
||||
CRS to use for the returned GeoDataFrame. The value can be anything accepted
|
||||
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
|
||||
such as an authority string (eg "EPSG:4326") or a WKT string.
|
||||
If not set, tries to determine CRS from the SRID associated with the
|
||||
first geometry in the database, and assigns that to all geometries.
|
||||
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
|
||||
Active connection to the database to query.
|
||||
Returns
|
||||
-------
|
||||
GeoDataFrame
|
||||
"""
|
||||
|
||||
if geom_col not in df:
|
||||
raise ValueError("Query missing geometry column '{}'".format(geom_col))
|
||||
|
||||
if df.columns.to_list().count(geom_col) > 1:
|
||||
raise ValueError(
|
||||
f"Duplicate geometry column '{geom_col}' detected in SQL query output. Only"
|
||||
"one geometry column is allowed."
|
||||
)
|
||||
|
||||
geoms = df[geom_col].dropna()
|
||||
|
||||
if not geoms.empty:
|
||||
load_geom_bytes = shapely.wkb.loads
|
||||
"""Load from Python 3 binary."""
|
||||
|
||||
def load_geom_text(x):
|
||||
"""Load from binary encoded as text."""
|
||||
return shapely.wkb.loads(str(x), hex=True)
|
||||
|
||||
if isinstance(geoms.iat[0], bytes):
|
||||
load_geom = load_geom_bytes
|
||||
else:
|
||||
load_geom = load_geom_text
|
||||
|
||||
df[geom_col] = geoms = geoms.apply(load_geom)
|
||||
if crs is None:
|
||||
srid = shapely.get_srid(geoms.iat[0])
|
||||
# if no defined SRID in geodatabase, returns SRID of 0
|
||||
if srid != 0:
|
||||
try:
|
||||
spatial_ref_sys_df = _get_spatial_ref_sys_df(con, srid)
|
||||
except pd.errors.DatabaseError:
|
||||
warning_msg = (
|
||||
f"Could not find the spatial reference system table "
|
||||
f"(spatial_ref_sys) in PostGIS."
|
||||
f"Trying epsg:{srid} as a fallback."
|
||||
)
|
||||
warnings.warn(warning_msg, UserWarning, stacklevel=3)
|
||||
crs = "epsg:{}".format(srid)
|
||||
else:
|
||||
if not spatial_ref_sys_df.empty:
|
||||
auth_name = spatial_ref_sys_df["auth_name"].item()
|
||||
crs = f"{auth_name}:{srid}"
|
||||
else:
|
||||
warning_msg = (
|
||||
f"Could not find srid {srid} in the "
|
||||
f"spatial_ref_sys table. "
|
||||
f"Trying epsg:{srid} as a fallback."
|
||||
)
|
||||
warnings.warn(warning_msg, UserWarning, stacklevel=3)
|
||||
crs = "epsg:{}".format(srid)
|
||||
|
||||
return GeoDataFrame(df, crs=crs, geometry=geom_col)
|
||||
|
||||
|
||||
def _read_postgis(
    sql,
    con,
    geom_col="geom",
    crs=None,
    index_col=None,
    coerce_float=True,
    parse_dates=None,
    params=None,
    chunksize=None,
):
    """
    Returns a GeoDataFrame corresponding to the result of the query
    string, which must contain a geometry column in WKB representation.

    It is also possible to use :meth:`~GeoDataFrame.read_file` to read from a database.
    Especially for file geodatabases like GeoPackage or SpatiaLite this can be easier.

    Parameters
    ----------
    sql : string
        SQL query to execute in selecting entries from database, or name
        of the table to read from the database.
    con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
        Active connection to the database to query.
    geom_col : string, default 'geom'
        column name to convert to shapely geometries
    crs : dict or str, optional
        CRS to use for the returned GeoDataFrame; if not set, tries to
        determine CRS from the SRID associated with the first geometry in
        the database, and assigns that to all geometries.
    chunksize : int, default None
        If specified, return an iterator where chunksize is the number of rows to
        include in each chunk.

    See the documentation for pandas.read_sql for further explanation
    of the following parameters:
    index_col, coerce_float, parse_dates, params, chunksize

    Returns
    -------
    GeoDataFrame

    Examples
    --------
    PostGIS

    >>> from sqlalchemy import create_engine  # doctest: +SKIP
    >>> db_connection_url = "postgresql://myusername:mypassword@myhost:5432/mydatabase"
    >>> con = create_engine(db_connection_url)  # doctest: +SKIP
    >>> sql = "SELECT geom, highway FROM roads"
    >>> df = geopandas.read_postgis(sql, con)  # doctest: +SKIP

    SpatiaLite

    >>> sql = "SELECT ST_AsBinary(geom) AS geom, highway FROM roads"
    >>> df = geopandas.read_postgis(sql, con)  # doctest: +SKIP
    """
    # pandas returns a single DataFrame when chunksize is None and an
    # iterator of DataFrames otherwise; the call itself is identical either way
    result = pd.read_sql(
        sql,
        con,
        index_col=index_col,
        coerce_float=coerce_float,
        parse_dates=parse_dates,
        params=params,
        chunksize=chunksize,
    )

    if chunksize is None:
        # single chunk -> single GeoDataFrame
        return _df_to_geodf(result, geom_col=geom_col, crs=crs, con=con)

    # chunked read -> lazily convert each chunk to a GeoDataFrame
    return (
        _df_to_geodf(chunk, geom_col=geom_col, crs=crs, con=con) for chunk in result
    )
|
||||
|
||||
|
||||
def _get_geometry_type(gdf):
|
||||
"""
|
||||
Get basic geometry type of a GeoDataFrame. See more info from:
|
||||
https://geoalchemy-2.readthedocs.io/en/latest/types.html#geoalchemy2.types._GISType
|
||||
|
||||
Following rules apply:
|
||||
- if geometries all share the same geometry-type,
|
||||
geometries are inserted with the given GeometryType with following types:
|
||||
- Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon,
|
||||
GeometryCollection.
|
||||
- LinearRing geometries will be converted into LineString -objects.
|
||||
- in all other cases, geometries will be inserted with type GEOMETRY:
|
||||
- a mix of Polygons and MultiPolygons in GeoSeries
|
||||
- a mix of Points and LineStrings in GeoSeries
|
||||
- geometry is of type GeometryCollection,
|
||||
such as GeometryCollection([Point, LineStrings])
|
||||
- if any of the geometries has Z-coordinate, all records will
|
||||
be written with 3D.
|
||||
"""
|
||||
geom_types = list(gdf.geometry.geom_type.unique())
|
||||
has_curve = False
|
||||
|
||||
for gt in geom_types:
|
||||
if gt is None:
|
||||
continue
|
||||
elif "LinearRing" in gt:
|
||||
has_curve = True
|
||||
|
||||
if len(geom_types) == 1:
|
||||
if has_curve:
|
||||
target_geom_type = "LINESTRING"
|
||||
else:
|
||||
if geom_types[0] is None:
|
||||
raise ValueError("No valid geometries in the data.")
|
||||
else:
|
||||
target_geom_type = geom_types[0].upper()
|
||||
else:
|
||||
target_geom_type = "GEOMETRY"
|
||||
|
||||
# Check for 3D-coordinates
|
||||
if any(gdf.geometry.has_z):
|
||||
target_geom_type += "Z"
|
||||
|
||||
return target_geom_type, has_curve
|
||||
|
||||
|
||||
def _get_srid_from_crs(gdf):
|
||||
"""
|
||||
Get EPSG code from CRS if available. If not, return 0.
|
||||
"""
|
||||
|
||||
# Use geoalchemy2 default for srid
|
||||
# Note: undefined srid in PostGIS is 0
|
||||
srid = None
|
||||
warning_msg = (
|
||||
"Could not parse CRS from the GeoDataFrame. "
|
||||
"Inserting data without defined CRS."
|
||||
)
|
||||
if gdf.crs is not None:
|
||||
try:
|
||||
for confidence in (100, 70, 25):
|
||||
srid = gdf.crs.to_epsg(min_confidence=confidence)
|
||||
if srid is not None:
|
||||
break
|
||||
auth_srid = gdf.crs.to_authority(
|
||||
auth_name="ESRI", min_confidence=confidence
|
||||
)
|
||||
if auth_srid is not None:
|
||||
srid = int(auth_srid[1])
|
||||
break
|
||||
except Exception:
|
||||
warnings.warn(warning_msg, UserWarning, stacklevel=2)
|
||||
|
||||
if srid is None:
|
||||
srid = 0
|
||||
warnings.warn(warning_msg, UserWarning, stacklevel=2)
|
||||
|
||||
return srid
|
||||
|
||||
|
||||
def _convert_linearring_to_linestring(gdf, geom_name):
    """Replace every LinearRing geometry in *gdf* by an equivalent LineString."""
    from shapely.geometry import LineString

    # TODO: switch to a shapely builtin once implemented:
    # https://github.com/shapely/shapely/issues/1617
    is_ring = gdf.geom_type == "LinearRing"
    gdf.loc[is_ring, geom_name] = gdf.loc[is_ring, geom_name].apply(LineString)
    return gdf
|
||||
|
||||
|
||||
def _convert_to_ewkb(gdf, geom_name, srid):
    """Convert geometries to ewkb."""
    # Stamp the SRID onto the raw geometry array, then serialize to hex EWKB.
    with_srid = shapely.set_srid(gdf[geom_name].values._data, srid=srid)
    ewkb = shapely.to_wkb(with_srid, hex=True, include_srid=True)

    # The geometry column now holds EWKB strings instead of in-memory
    # geometries; return a plain DataFrame so GeoDataFrame does not warn the
    # user about unexpected dtypes.
    plain = pd.DataFrame(gdf, copy=False)
    plain[geom_name] = ewkb
    return plain
|
||||
|
||||
|
||||
def _psql_insert_copy(tbl, conn, keys, data_iter):
    """Fast pandas ``to_sql`` insert method using PostgreSQL COPY FROM STDIN."""
    import csv
    import io

    # Serialize all rows to an in-memory CSV buffer.
    buffer = io.StringIO()
    csv.writer(buffer).writerows(data_iter)
    buffer.seek(0)

    columns = ", ".join('"{}"'.format(k) for k in keys)

    sql = 'COPY "{}"."{}" ({}) FROM STDIN WITH CSV'.format(
        tbl.table.schema, tbl.table.name, columns
    )

    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        if hasattr(cur, "copy") and callable(cur.copy):
            # psycopg (v3) exposes copy() as a context manager.
            with cur.copy(sql) as copy:
                copy.write(buffer.read())
        else:
            # psycopg2 fallback.
            cur.copy_expert(sql, buffer)
|
||||
|
||||
|
||||
def _write_postgis(
    gdf,
    name,
    con,
    schema=None,
    if_exists="fail",
    index=False,
    index_label=None,
    chunksize=None,
    dtype=None,
):
    """
    Upload GeoDataFrame into PostGIS database.

    This method requires SQLAlchemy and GeoAlchemy2, and a PostgreSQL
    Python driver (e.g. psycopg2) to be installed.

    Parameters
    ----------
    gdf : GeoDataFrame
        The GeoDataFrame to upload.
    name : str
        Name of the target table.
    con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
        Active connection to the PostGIS database.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists:

        - fail: Raise a ValueError.
        - replace: Drop the table before inserting new values.
        - append: Insert new values to the existing table.
    schema : string, optional
        Specify the schema. If None, use default schema: 'public'.
    index : bool, default False
        Write DataFrame index as a column.
        Uses *index_label* as the column name in the table.
    index_label : string or sequence, default None
        Column label for index column(s).
        If None is given (default) and index is True,
        then the index names are used.
    chunksize : int, optional
        Rows will be written in batches of this size at a time.
        By default, all rows will be written at once.
    dtype : dict of column name to SQL type, default None
        Specifying the datatype for columns.
        The keys should be the column names and the values
        should be the SQLAlchemy types.

    Examples
    --------

    >>> from sqlalchemy import create_engine  # doctest: +SKIP
    >>> engine = create_engine("postgresql://myusername:mypassword@myhost:5432\
/mydatabase")  # doctest: +SKIP
    >>> gdf.to_postgis("my_table", engine)  # doctest: +SKIP
    """
    try:
        from geoalchemy2 import Geometry
        from sqlalchemy import text
    except ImportError:
        raise ImportError("'to_postgis()' requires geoalchemy2 package.")

    gdf = gdf.copy()
    geom_name = gdf.geometry.name

    # Get srid
    srid = _get_srid_from_crs(gdf)

    # Get geometry type and info whether data contains LinearRing.
    geometry_type, has_curve = _get_geometry_type(gdf)

    # Build dtype with Geometry
    if dtype is not None:
        dtype[geom_name] = Geometry(geometry_type=geometry_type, srid=srid)
    else:
        dtype = {geom_name: Geometry(geometry_type=geometry_type, srid=srid)}

    # Convert LinearRing geometries to LineString
    if has_curve:
        gdf = _convert_linearring_to_linestring(gdf, geom_name)

    # Convert geometries to EWKB
    gdf = _convert_to_ewkb(gdf, geom_name, srid)

    if schema is not None:
        schema_name = schema
    else:
        schema_name = "public"

    if if_exists == "append":
        # Check that the geometry srid matches with the current GeoDataFrame
        with _get_conn(con) as connection:
            # Only check SRID if table exists
            if connection.dialect.has_table(connection, name, schema):
                # Use bound parameters rather than interpolating the schema,
                # table, and column names into the SQL string, so unusual
                # identifiers (quotes, etc.) cannot break or inject SQL.
                target_srid = connection.execute(
                    text("SELECT Find_SRID(:schema, :table, :geom_col);"),
                    {"schema": schema_name, "table": name, "geom_col": geom_name},
                ).fetchone()[0]

                if target_srid != srid:
                    msg = (
                        "The CRS of the target table (EPSG:{epsg_t}) differs from the "
                        "CRS of current GeoDataFrame (EPSG:{epsg_src}).".format(
                            epsg_t=target_srid, epsg_src=srid
                        )
                    )
                    raise ValueError(msg)

    with _get_conn(con) as connection:
        gdf.to_sql(
            name,
            connection,
            schema=schema_name,
            if_exists=if_exists,
            index=index,
            index_label=index_label,
            chunksize=chunksize,
            dtype=dtype,
            method=_psql_insert_copy,
        )
|
||||
|
||||
|
||||
@lru_cache
def _get_spatial_ref_sys_df(con, srid):
    # Look up *srid* in the PostGIS ``spatial_ref_sys`` table and return the
    # matching (srid, auth_name) rows as a pandas DataFrame.
    # NOTE(review): memoized with an unbounded lru_cache keyed on (con, srid);
    # the cache keeps a reference to ``con`` alive for the process lifetime —
    # confirm this is acceptable for long-lived engines/connections.
    # ``srid`` is interpolated directly into the SQL; assumed to always be an
    # int from trusted internal callers — TODO confirm.
    spatial_ref_sys_sql = (
        f"SELECT srid, auth_name FROM spatial_ref_sys WHERE srid = {srid}"
    )
    return pd.read_sql(spatial_ref_sys_sql, con)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,100 +0,0 @@
|
||||
"""
|
||||
Script to create the data and write legacy storage (pickle) files.
|
||||
|
||||
Based on pandas' generate_legacy_storage_files.py script.
|
||||
|
||||
To use this script, create an environment for which you want to
|
||||
generate pickles, activate the environment, and run this script as:
|
||||
|
||||
$ python geopandas/geopandas/io/tests/generate_legacy_storage_files.py \
|
||||
geopandas/geopandas/io/tests/data/pickle/ pickle
|
||||
|
||||
This script generates a storage file for the current arch, system,
|
||||
|
||||
The idea here is you are using the *current* version of the
|
||||
generate_legacy_storage_files with an *older* version of geopandas to
|
||||
generate a pickle file. We will then check this file into a current
|
||||
branch, and test using test_pickle.py. This will load the *older*
|
||||
pickles and test versus the current data that is generated
|
||||
(with master). These are then compared.
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import platform
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from shapely.geometry import Point
|
||||
|
||||
import geopandas
|
||||
|
||||
|
||||
def create_pickle_data():
    """create the pickle data"""
    # GeoDataFrame using a non-default geometry column name.
    gdf_the_geom = geopandas.GeoDataFrame(
        {"a": [1, 2, 3], "the_geom": [Point(i, i) for i in (1, 2, 3)]},
        geometry="the_geom",
    )

    # GeoDataFrame carrying a CRS.
    gdf_crs = geopandas.GeoDataFrame(
        {"a": [0.1, 0.2, 0.3], "geometry": [Point(i, i) for i in (1, 2, 3)]},
        crs="EPSG:4326",
    )

    return {"gdf_the_geom": gdf_the_geom, "gdf_crs": gdf_crs}
|
||||
|
||||
|
||||
def platform_name():
    """Build a filename stem encoding library versions and the platform."""
    parts = [
        str(geopandas.__version__),
        "pd-" + str(pd.__version__),
        "py-" + str(platform.python_version()),
        str(platform.machine()),
        str(platform.system().lower()),
    ]
    return "_".join(parts)
|
||||
|
||||
|
||||
def write_legacy_pickles(output_dir):
    """Pickle the legacy test data into *output_dir*.

    The file name encodes the geopandas/pandas/python versions and the
    platform (see ``platform_name``), so pickles generated by different
    environments do not collide.
    """
    print(
        "This script generates a storage file for the current arch, system, "
        "and python version"
    )
    # BUG FIX: the original ``print("geopandas version: {}").format(...)``
    # called .format() on print's return value (None), raising
    # AttributeError; format the string before printing it.
    print("geopandas version: {}".format(geopandas.__version__))
    print(" output dir : {}".format(output_dir))
    print(" storage format: pickle")

    pth = "{}.pickle".format(platform_name())

    # Use a context manager so the file handle is closed even if pickling
    # raises.
    with open(os.path.join(output_dir, pth), "wb") as fh:
        pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)

    print("created pickle file: {}".format(pth))
|
||||
|
||||
|
||||
def main():
    """Entry point: validate argv and dispatch to the pickle writer."""
    if len(sys.argv) != 3:
        sys.exit(
            "Specify output directory and storage type: generate_legacy_"
            "storage_files.py <output_dir> <storage_type> "
        )

    output_dir, storage_type = str(sys.argv[1]), str(sys.argv[2])

    # Only the pickle format is supported today.
    if storage_type != "pickle":
        sys.exit("storage_type must be one of {'pickle'}")
    write_legacy_pickles(output_dir=output_dir)
|
||||
|
||||
|
||||
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,328 +0,0 @@
|
||||
import os
|
||||
|
||||
from shapely.geometry import (
|
||||
LineString,
|
||||
MultiLineString,
|
||||
MultiPoint,
|
||||
MultiPolygon,
|
||||
Point,
|
||||
Polygon,
|
||||
)
|
||||
|
||||
import geopandas
|
||||
from geopandas import GeoDataFrame
|
||||
|
||||
from .test_file import FIONA_MARK, PYOGRIO_MARK
|
||||
|
||||
import pytest
|
||||
from geopandas.testing import assert_geodataframe_equal
|
||||
|
||||
# Credit: Polygons below come from Montreal city Open Data portal
|
||||
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
|
||||
city_hall_boundaries = Polygon(
|
||||
(
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
(-73.5546126200639, 45.5086813829106),
|
||||
(-73.5540185061397, 45.5084409343852),
|
||||
(-73.5539986525799, 45.5084323044531),
|
||||
(-73.5535801792994, 45.5089539203786),
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
)
|
||||
)
|
||||
vauquelin_place = Polygon(
|
||||
(
|
||||
(-73.5542465586147, 45.5081555487952),
|
||||
(-73.5540185061397, 45.5084409343852),
|
||||
(-73.5546126200639, 45.5086813829106),
|
||||
(-73.5548825850032, 45.5084033554357),
|
||||
(-73.5542465586147, 45.5081555487952),
|
||||
)
|
||||
)
|
||||
|
||||
city_hall_walls = [
|
||||
LineString(
|
||||
(
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
(-73.5546126200639, 45.5086813829106),
|
||||
(-73.5540185061397, 45.5084409343852),
|
||||
)
|
||||
),
|
||||
LineString(
|
||||
(
|
||||
(-73.5539986525799, 45.5084323044531),
|
||||
(-73.5535801792994, 45.5089539203786),
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
)
|
||||
),
|
||||
]
|
||||
|
||||
city_hall_entrance = Point(-73.553785, 45.508722)
|
||||
city_hall_balcony = Point(-73.554138, 45.509080)
|
||||
city_hall_council_chamber = Point(-73.554246, 45.508931)
|
||||
|
||||
point_3D = Point(-73.553785, 45.508722, 300)
|
||||
|
||||
|
||||
# *****************************************
|
||||
# TEST TOOLING
|
||||
|
||||
|
||||
class _ExpectedError:
|
||||
def __init__(self, error_type, error_message_match):
|
||||
self.type = error_type
|
||||
self.match = error_message_match
|
||||
|
||||
|
||||
class _ExpectedErrorBuilder:
    """Fluent helper: ``_expect_writing(gdf, driver).to_raise(Type, match)``."""

    def __init__(self, composite_key):
        self.composite_key = composite_key

    def to_raise(self, error_type, error_match):
        # Register the expectation in the module-level registry keyed by
        # the (gdf, driver) composite key.
        _expected_exceptions[self.composite_key] = _ExpectedError(
            error_type, error_match
        )
|
||||
|
||||
|
||||
def _expect_writing(gdf, ogr_driver):
    """Begin registering an expected write failure for (gdf, driver)."""
    return _ExpectedErrorBuilder(_composite_key(gdf, ogr_driver))
|
||||
|
||||
|
||||
def _composite_key(gdf, ogr_driver):
|
||||
return frozenset([id(gdf), ogr_driver])
|
||||
|
||||
|
||||
def _expected_error_on(gdf, ogr_driver):
    """Return the registered _ExpectedError for this combination, or None."""
    return _expected_exceptions.get(_composite_key(gdf, ogr_driver), None)
|
||||
|
||||
|
||||
# *****************************************
|
||||
# TEST CASES
|
||||
_geodataframes_to_write = []
|
||||
_expected_exceptions = {}
|
||||
_CRS = "epsg:4326"
|
||||
|
||||
# ------------------
|
||||
# gdf with Points
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]}, crs=_CRS, geometry=[city_hall_entrance, city_hall_balcony]
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with MultiPoints
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]},
|
||||
crs=_CRS,
|
||||
geometry=[
|
||||
MultiPoint([city_hall_balcony, city_hall_council_chamber]),
|
||||
MultiPoint([city_hall_entrance, city_hall_balcony, city_hall_council_chamber]),
|
||||
],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with Points and MultiPoints
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]},
|
||||
crs=_CRS,
|
||||
geometry=[MultiPoint([city_hall_entrance, city_hall_balcony]), city_hall_balcony],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
# 'ESRI Shapefile' driver supports writing LineString/MultiLinestring and
|
||||
# Polygon/MultiPolygon but does not mention Point/MultiPoint
|
||||
# see https://www.gdal.org/drv_shapefile.html
|
||||
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
|
||||
|
||||
# ------------------
|
||||
# gdf with LineStrings
|
||||
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=city_hall_walls)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with MultiLineStrings
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]},
|
||||
crs=_CRS,
|
||||
geometry=[MultiLineString(city_hall_walls), MultiLineString(city_hall_walls)],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with LineStrings and MultiLineStrings
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]},
|
||||
crs=_CRS,
|
||||
geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with Polygons
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]}, crs=_CRS, geometry=[city_hall_boundaries, vauquelin_place]
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with MultiPolygon
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1]},
|
||||
crs=_CRS,
|
||||
geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with Polygon and MultiPolygon
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]},
|
||||
crs=_CRS,
|
||||
geometry=[
|
||||
MultiPolygon((city_hall_boundaries, vauquelin_place)),
|
||||
city_hall_boundaries,
|
||||
],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with null geometry and Point
|
||||
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, city_hall_entrance])
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with null geometry and 3D Point
|
||||
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, point_3D])
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with null geometries only
|
||||
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, None])
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with all shape types mixed together
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2, 3, 4, 5, 6]},
|
||||
crs=_CRS,
|
||||
geometry=[
|
||||
MultiPolygon((city_hall_boundaries, vauquelin_place)),
|
||||
city_hall_entrance,
|
||||
MultiLineString(city_hall_walls),
|
||||
city_hall_walls[0],
|
||||
MultiPoint([city_hall_entrance, city_hall_balcony]),
|
||||
city_hall_balcony,
|
||||
],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
# Not supported by 'ESRI Shapefile' driver
|
||||
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
|
||||
|
||||
# ------------------
|
||||
# gdf with all 2D shape types and 3D Point mixed together
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2, 3, 4, 5, 6, 7]},
|
||||
crs=_CRS,
|
||||
geometry=[
|
||||
MultiPolygon((city_hall_boundaries, vauquelin_place)),
|
||||
city_hall_entrance,
|
||||
MultiLineString(city_hall_walls),
|
||||
city_hall_walls[0],
|
||||
MultiPoint([city_hall_entrance, city_hall_balcony]),
|
||||
city_hall_balcony,
|
||||
point_3D,
|
||||
],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
# Not supported by 'ESRI Shapefile' driver
|
||||
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
|
||||
|
||||
|
||||
# Parametrized over every GeoDataFrame test case collected above.
@pytest.fixture(params=_geodataframes_to_write)
def geodataframe(request):
    # Each test invocation receives one prepared GeoDataFrame.
    return request.param
|
||||
|
||||
|
||||
# (OGR driver name, file extension) pairs exercised by the round-trip test.
@pytest.fixture(
    params=[
        ("GeoJSON", ".geojson"),
        ("ESRI Shapefile", ".shp"),
        ("GPKG", ".gpkg"),
        ("SQLite", ".sqlite"),
    ]
)
def ogr_driver(request):
    # Tuple of (driver, extension) passed to to_file/read_file.
    return request.param
|
||||
|
||||
|
||||
# File I/O engines; each param is skipped when the corresponding package
# (fiona / pyogrio) is not installed.
@pytest.fixture(
    params=[
        pytest.param("fiona", marks=FIONA_MARK),
        pytest.param("pyogrio", marks=PYOGRIO_MARK),
    ]
)
def engine(request):
    # Engine name string passed through to to_file/read_file.
    return request.param
|
||||
|
||||
|
||||
def test_to_file_roundtrip(tmpdir, geodataframe, ogr_driver, engine):
    """Write each test GeoDataFrame with each driver/engine and read it back.

    Combinations registered via ``_expect_writing`` must raise; all others
    must round-trip to an equal GeoDataFrame.
    """
    driver, ext = ogr_driver
    output_file = os.path.join(str(tmpdir), "output_file" + ext)
    write_kwargs = {}
    if driver == "SQLite":
        write_kwargs["spatialite"] = True

    # This if statement can be removed once minimal fiona version >= 1.8.20
    if engine == "fiona":
        from packaging.version import Version

        import fiona

        if Version(fiona.__version__) < Version("1.8.20"):
            pytest.skip("SQLite driver only available from version 1.8.20")

    # If only 3D Points, geometry_type needs to be specified for spatialite at the
    # moment. This if can be removed once the following PR is released:
    # https://github.com/geopandas/pyogrio/pull/223
    # BUG FIX: the original condition read ``len(geodataframe == 2)`` — the
    # row count of an elementwise comparison, truthy for any non-empty frame.
    # The intent is to check the frame has exactly two rows.
    if (
        engine == "pyogrio"
        and len(geodataframe) == 2
        and geodataframe.geometry[0] is None
        and geodataframe.geometry[1] is not None
        and geodataframe.geometry[1].has_z
    ):
        write_kwargs["geometry_type"] = "Point Z"

    expected_error = _expected_error_on(geodataframe, driver)
    if expected_error:
        with pytest.raises(
            RuntimeError, match="Failed to write record|Could not add feature to layer"
        ):
            geodataframe.to_file(
                output_file, driver=driver, engine=engine, **write_kwargs
            )
    else:
        if driver == "SQLite" and engine == "pyogrio":
            try:
                geodataframe.to_file(
                    output_file, driver=driver, engine=engine, **write_kwargs
                )
            except ValueError as e:
                # pyogrio built without SpatiaLite rejects the option.
                if "unrecognized option 'SPATIALITE'" in str(e):
                    pytest.xfail(
                        "pyogrio wheels from PyPI do not come with SpatiaLite support. "
                        f"Error: {e}"
                    )
                raise
        else:
            geodataframe.to_file(
                output_file, driver=driver, engine=engine, **write_kwargs
            )

        reloaded = geopandas.read_file(output_file, engine=engine)

        if driver == "GeoJSON" and engine == "pyogrio":
            # For GeoJSON files, the int64 column comes back as int32
            reloaded["a"] = reloaded["a"].astype("int64")

        assert_geodataframe_equal(geodataframe, reloaded, check_column_type="equiv")
|
||||
@@ -1,537 +0,0 @@
|
||||
import contextlib
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
from packaging.version import Version
|
||||
|
||||
import numpy as np
|
||||
|
||||
import shapely
|
||||
from shapely import MultiPoint, Point, box
|
||||
|
||||
from geopandas import GeoDataFrame, GeoSeries
|
||||
|
||||
import pytest
|
||||
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
|
||||
|
||||
pytest.importorskip("pyarrow")
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
from pyarrow import feather
|
||||
|
||||
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
|
||||
|
||||
|
||||
def pa_table(table):
    """Coerce geopandas' ArrowTable wrapper to a ``pyarrow.Table``."""
    if Version(pa.__version__) >= Version("14.0.0"):
        # pyarrow >= 14 understands the Arrow PyCapsule protocol directly.
        return pa.table(table)
    # Older pyarrow: reach into the wrapper for the underlying table.
    return table._pa_table
|
||||
|
||||
|
||||
def pa_array(array):
    """Coerce geopandas' GeoArrowArray wrapper to a ``pyarrow.Array``."""
    if Version(pa.__version__) >= Version("14.0.0"):
        # pyarrow >= 14 understands the Arrow PyCapsule protocol directly.
        return pa.array(array)
    # Older pyarrow: reach into the wrapper for the underlying array.
    return array._pa_array
|
||||
|
||||
|
||||
def assert_table_equal(left, right, check_metadata=True):
    """Assert two pyarrow Tables with a 'geometry' column are equal.

    Point geometries stored as fixed-size-list (interleaved) or struct
    (separated) arrays may contain NaN coordinates for empty points, and
    ``pyarrow.Table.equals`` treats NaN as not equal to NaN.  So for those
    layouts the NaN positions are compared first and then replaced by 0.0
    before the table-level comparison.  On failure, an AssertionError is
    raised naming the first differing part (schema, metadata, or column).
    """
    geom_type = left["geometry"].type
    # in case of Points (directly the inner fixed_size_list or struct type)
    # -> there are NaNs for empties -> we need to compare them separately
    # and then fill, because pyarrow.Table.equals considers NaNs as not equal
    if pa.types.is_fixed_size_list(geom_type):
        # Interleaved coordinates: one flat child values array.
        left_values = left["geometry"].chunk(0).values
        right_values = right["geometry"].chunk(0).values
        assert pc.is_nan(left_values).equals(pc.is_nan(right_values))
        left_geoms = pa.FixedSizeListArray.from_arrays(
            pc.replace_with_mask(left_values, pc.is_nan(left_values), 0.0),
            type=left["geometry"].type,
        )
        right_geoms = pa.FixedSizeListArray.from_arrays(
            pc.replace_with_mask(right_values, pc.is_nan(right_values), 0.0),
            type=right["geometry"].type,
        )
        left = left.set_column(1, left.schema.field("geometry"), left_geoms)
        right = right.set_column(1, right.schema.field("geometry"), right_geoms)

    elif pa.types.is_struct(geom_type):
        # Separated coordinates: one child array per dimension.
        left_arr = left["geometry"].chunk(0)
        right_arr = right["geometry"].chunk(0)

        for i in range(left_arr.type.num_fields):
            assert pc.is_nan(left_arr.field(i)).equals(pc.is_nan(right_arr.field(i)))

        left_geoms = pa.StructArray.from_arrays(
            [
                pc.replace_with_mask(
                    left_arr.field(i), pc.is_nan(left_arr.field(i)), 0.0
                )
                for i in range(left_arr.type.num_fields)
            ],
            fields=list(left["geometry"].type),
        )
        right_geoms = pa.StructArray.from_arrays(
            [
                pc.replace_with_mask(
                    right_arr.field(i), pc.is_nan(right_arr.field(i)), 0.0
                )
                for i in range(right_arr.type.num_fields)
            ],
            fields=list(right["geometry"].type),
        )

        left = left.set_column(1, left.schema.field("geometry"), left_geoms)
        right = right.set_column(1, right.schema.field("geometry"), right_geoms)

    # Fast path: tables compare equal as a whole.
    if left.equals(right, check_metadata=check_metadata):
        return

    # Tables differ: raise with the most specific diagnosis available.
    if not left.schema.equals(right.schema):
        raise AssertionError(
            "Schema not equal\nLeft:\n{0}\nRight:\n{1}".format(
                left.schema, right.schema
            )
        )

    if check_metadata:
        if not left.schema.equals(right.schema, check_metadata=True):
            if not left.schema.metadata == right.schema.metadata:
                raise AssertionError(
                    "Metadata not equal\nLeft:\n{0}\nRight:\n{1}".format(
                        left.schema.metadata, right.schema.metadata
                    )
                )
        for col in left.schema.names:
            assert left.schema.field(col).equals(
                right.schema.field(col), check_metadata=True
            )

    # Compare column by column to name the offending one.
    for col in left.column_names:
        a_left = pa.concat_arrays(left.column(col).chunks)
        a_right = pa.concat_arrays(right.column(col).chunks)
        if not a_left.equals(a_right):
            raise AssertionError(
                "Column '{0}' not equal:\n{1}".format(col, a_left.diff(a_right))
            )

    raise AssertionError("Tables not equal for unknown reason")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    shapely.geos_version < (3, 9, 0),
    reason="Checking for empty is buggy with GEOS<3.9",
)  # an old GEOS is installed in the CI builds with the defaults channel
@pytest.mark.parametrize(
    "dim",
    [
        "xy",
        pytest.param(
            "xyz",
            marks=pytest.mark.skipif(
                shapely.geos_version < (3, 10, 0),
                reason="Cannot write 3D geometries with GEOS<3.10",
            ),
        ),
    ],
)
@pytest.mark.parametrize(
    "geometry_type",
    ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
@pytest.mark.parametrize(
    "geometry_encoding, interleaved",
    [("WKB", None), ("geoarrow", True), ("geoarrow", False)],
    ids=["WKB", "geoarrow-interleaved", "geoarrow-separated"],
)
def test_geoarrow_export(geometry_type, dim, geometry_encoding, interleaved):
    """Compare GeoDataFrame/GeoSeries Arrow export against golden files.

    Golden .arrow files per geometry type/dimension/encoding live under
    DATA_PATH/geoarrow; the WKB variant of each file doubles as the input.
    """
    base_path = DATA_PATH / "geoarrow"
    suffix = geometry_type + ("_z" if dim == "xyz" else "")

    # Read the example data
    df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
    df["geometry"] = GeoSeries.from_wkb(df["geometry"])
    df["row_number"] = df["row_number"].astype("int32")
    df = GeoDataFrame(df)
    # Golden files carry no CRS metadata, so strip it from the input too.
    df.geometry.array.crs = None

    # Read the expected data
    if geometry_encoding == "WKB":
        filename = f"example-{suffix}-wkb.arrow"
    else:
        filename = f"example-{suffix}{'-interleaved' if interleaved else ''}.arrow"
    expected = feather.read_table(base_path / filename)

    # GeoDataFrame -> Arrow Table
    result = pa_table(
        df.to_arrow(geometry_encoding=geometry_encoding, interleaved=interleaved)
    )
    # remove the "pandas" metadata
    result = result.replace_schema_metadata(None)

    mask_nonempty = None
    if (
        geometry_encoding == "WKB"
        and dim == "xyz"
        and geometry_type.startswith("multi")
    ):
        # for collections with z dimension, drop the empties because those don't
        # roundtrip correctly to WKB
        # (https://github.com/libgeos/geos/issues/888)
        mask_nonempty = pa.array(np.asarray(~df.geometry.is_empty))
        result = result.filter(mask_nonempty)
        expected = expected.filter(mask_nonempty)

    assert_table_equal(result, expected)

    # GeoSeries -> Arrow array
    if geometry_encoding != "WKB" and geometry_type == "point":
        # for points, we again have to handle NaNs separately, we already did that
        # for table so let's just skip this part
        return
    result_arr = pa_array(
        df.geometry.to_arrow(
            geometry_encoding=geometry_encoding, interleaved=interleaved
        )
    )
    if mask_nonempty is not None:
        result_arr = result_arr.filter(mask_nonempty)
    assert result_arr.equals(expected["geometry"].chunk(0))
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    Version(shapely.__version__) < Version("2.0.2"),
    reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_multiple_geometry_crs(encoding):
    """Each geometry column keeps its own CRS through an Arrow round trip."""
    pytest.importorskip("pyproj")
    # ensure each geometry column has its own crs
    gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
    gdf["geom2"] = gdf.geometry.to_crs("epsg:3857")

    result = pa_table(gdf.to_arrow(geometry_encoding=encoding))
    # The CRS is stored as PROJJSON in each field's Arrow extension metadata.
    meta1 = json.loads(
        result.schema.field("geometry").metadata[b"ARROW:extension:metadata"]
    )
    assert json.loads(meta1["crs"])["id"]["code"] == 4326
    meta2 = json.loads(
        result.schema.field("geom2").metadata[b"ARROW:extension:metadata"]
    )
    assert json.loads(meta2["crs"])["id"]["code"] == 3857

    # Round-trip back and confirm neither column's CRS was mutated.
    roundtripped = GeoDataFrame.from_arrow(result)
    assert_geodataframe_equal(gdf, roundtripped)
    assert gdf.geometry.crs == "epsg:4326"
    assert gdf.geom2.crs == "epsg:3857"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_series_name_crs(encoding):
    """GeoSeries.to_arrow preserves the series name and CRS in the schema."""
    pytest.importorskip("pyproj")
    pytest.importorskip("pyarrow", minversion="14.0.0")

    gser = GeoSeries([box(0, 0, 10, 10)], crs="epsg:4326", name="geom")
    # Export through the Arrow C-array capsule and re-import the schema.
    schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
    field = pa.Field._import_from_c_capsule(schema_capsule)
    assert field.name == "geom"
    assert (
        field.metadata[b"ARROW:extension:name"] == b"geoarrow.wkb"
        if encoding == "WKB"
        else b"geoarrow.polygon"
    )
    meta = json.loads(field.metadata[b"ARROW:extension:metadata"])
    assert json.loads(meta["crs"])["id"]["code"] == 4326

    # ensure it also works without a name
    gser = GeoSeries([box(0, 0, 10, 10)])
    schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
    field = pa.Field._import_from_c_capsule(schema_capsule)
    assert field.name == ""
|
||||
|
||||
|
||||
def test_geoarrow_unsupported_encoding():
    """An unknown geometry_encoding raises on both frame and series export."""
    gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")

    for exporter in (gdf.to_arrow, gdf.geometry.to_arrow):
        with pytest.raises(ValueError, match="Expected geometry encoding"):
            exporter(geometry_encoding="invalid")
|
||||
|
||||
|
||||
def test_geoarrow_mixed_geometry_types():
    """Mixed single/multi combos: point+polygon fails, point+multipoint promotes."""
    mixed = GeoDataFrame(
        {"geometry": [Point(0, 0), box(0, 0, 10, 10)]},
        crs="epsg:4326",
    )
    with pytest.raises(ValueError, match="Geometry type combination is not supported"):
        mixed.to_arrow(geometry_encoding="geoarrow")

    # point + multipoint is exportable: points are promoted to multipoints
    promotable = GeoDataFrame(
        {"geometry": [Point(0, 0), MultiPoint([(0, 0), (1, 1)])]},
        crs="epsg:4326",
    )
    table = pa_table(promotable.to_arrow(geometry_encoding="geoarrow"))
    extension_name = table.schema.field("geometry").metadata[b"ARROW:extension:name"]
    assert extension_name == b"geoarrow.multipoint"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("geom_type", ["point", "polygon"])
@pytest.mark.parametrize(
    "encoding, interleaved", [("WKB", True), ("geoarrow", True), ("geoarrow", False)]
)
def test_geoarrow_missing(encoding, interleaved, geom_type):
    # dummy test for single geometry type until missing values are included
    # in the test data for test_geoarrow_export
    geom = Point(0, 0) if geom_type == "point" else box(0, 0, 10, 10)
    gdf = GeoDataFrame(geometry=[geom, None], crs="epsg:4326")

    # Interleaved point layout cannot represent nulls before pyarrow 15.
    unsupported = (
        encoding == "geoarrow"
        and geom_type == "point"
        and interleaved
        and Version(pa.__version__) < Version("15.0.0")
    )
    if unsupported:
        with pytest.raises(
            ValueError,
            match="Converting point geometries with missing values is not supported",
        ):
            gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved)
        return

    table = pa_table(gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved))
    assert table["geometry"].null_count == 1
    assert table["geometry"].is_null().to_pylist() == [False, True]
|
||||
|
||||
|
||||
def test_geoarrow_include_z():
    """include_z forces/drops the third dimension; default follows the data."""

    def check_dims(table, expected_name, expected_size):
        assert table["geometry"].type.value_field.name == expected_name
        assert table["geometry"].type.list_size == expected_size

    gdf_2d = GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1), Point()]})

    check_dims(pa_table(gdf_2d.to_arrow(geometry_encoding="geoarrow")), "xy", 2)

    promoted = pa_table(gdf_2d.to_arrow(geometry_encoding="geoarrow", include_z=True))
    check_dims(promoted, "xyz", 3)
    # z values were synthesized, so every third interleaved coordinate is NaN
    assert np.isnan(promoted["geometry"].chunk(0).values.to_numpy()[2::3]).all()

    gdf_3d = GeoDataFrame({"geometry": [Point(0, 0, 0), Point(1, 1, 1), Point()]})

    check_dims(pa_table(gdf_3d.to_arrow(geometry_encoding="geoarrow")), "xyz", 3)

    demoted = pa_table(gdf_3d.to_arrow(geometry_encoding="geoarrow", include_z=False))
    check_dims(demoted, "xy", 2)
|
||||
|
||||
|
||||
@contextlib.contextmanager
def with_geoarrow_extension_types():
    """Temporarily register geoarrow-pyarrow extension types for a test body.

    Skips the test if geoarrow.pyarrow is not installed; always unregisters
    on exit so registration state does not leak between tests.
    """
    geoarrow_pa = pytest.importorskip("geoarrow.pyarrow")
    geoarrow_pa.register_extension_types()
    try:
        yield
    finally:
        geoarrow_pa.unregister_extension_types()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
    "geometry_type",
    ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
def test_geoarrow_export_with_extension_types(geometry_type, dim):
    # ensure the exported data can be imported by geoarrow-pyarrow and are
    # recognized as extension types
    base_path = DATA_PATH / "geoarrow"
    suffix = geometry_type + ("_z" if dim == "xyz" else "")

    # Read the example data
    df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
    df["geometry"] = GeoSeries.from_wkb(df["geometry"])
    df["row_number"] = df["row_number"].astype("int32")
    df = GeoDataFrame(df)
    df.geometry.array.crs = None

    pytest.importorskip("geoarrow.pyarrow")

    export_kwargs = [
        {"geometry_encoding": "WKB"},
        {"geometry_encoding": "geoarrow"},
        {"geometry_encoding": "geoarrow", "interleaved": False},
    ]
    with with_geoarrow_extension_types():
        # every supported layout must come back as a registered extension type
        for kwargs in export_kwargs:
            table = pa_table(df.to_arrow(**kwargs))
            assert isinstance(table["geometry"].type, pa.ExtensionType)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    Version(shapely.__version__) < Version("2.0.2"),
    reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
    "geometry_type",
    [
        "point",
        "linestring",
        "polygon",
        "multipoint",
        "multilinestring",
        "multipolygon",
    ],
)
def test_geoarrow_import(geometry_type, dim):
    """All stored GeoArrow layouts import to the same frame as the WKB file."""
    base_path = DATA_PATH / "geoarrow"
    suffix = geometry_type + ("_z" if dim == "xyz" else "")

    # Build the expected frame from the WKB reference file.
    expected = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
    expected["geometry"] = GeoSeries.from_wkb(expected["geometry"])
    expected = GeoDataFrame(expected)
    expected.geometry.crs = None

    # wkb, interleaved and separated encodings must all round-trip identically
    for stem in (
        f"example-{suffix}-wkb",
        f"example-{suffix}-interleaved",
        f"example-{suffix}",
    ):
        table = feather.read_table(base_path / f"{stem}.arrow")
        assert_geodataframe_equal(GeoDataFrame.from_arrow(table), expected)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    Version(shapely.__version__) < Version("2.0.2"),
    reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_import_geometry_column(encoding):
    """The active geometry column survives import and can be overridden."""
    pytest.importorskip("pyproj")
    # two geometry columns, so the import has to pick/accept an active one
    gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)])
    gdf["centroid"] = gdf.geometry.centroid

    imported = GeoDataFrame.from_arrow(
        pa_table(gdf.to_arrow(geometry_encoding=encoding))
    )
    assert_geodataframe_equal(imported, gdf)
    assert imported.active_geometry_name == "geometry"

    # a single non-"geometry" column becomes the active geometry
    only_centroid = GeoDataFrame.from_arrow(
        pa_table(gdf[["centroid"]].to_arrow(geometry_encoding=encoding))
    )
    assert only_centroid.active_geometry_name == "centroid"

    # the geometry= keyword overrides the default choice
    overridden = GeoDataFrame.from_arrow(
        pa_table(gdf.to_arrow(geometry_encoding=encoding)), geometry="centroid"
    )
    assert overridden.active_geometry_name == "centroid"
    assert_geodataframe_equal(overridden, gdf.set_geometry("centroid"))
|
||||
|
||||
|
||||
def test_geoarrow_import_missing_geometry():
    """Importing data without any geometry field raises a clear error."""
    pytest.importorskip("pyarrow", minversion="14.0.0")

    plain = pa.table({"a": [0, 1, 2], "b": [0.1, 0.2, 0.3]})

    with pytest.raises(ValueError, match="No geometry column found"):
        GeoDataFrame.from_arrow(plain)

    with pytest.raises(ValueError, match="No GeoArrow geometry field found"):
        GeoSeries.from_arrow(plain["a"].chunk(0))
|
||||
|
||||
|
||||
def test_geoarrow_import_capsule_interface():
    # ensure we can import non-pyarrow object
    pytest.importorskip("pyarrow", minversion="14.0.0")
    original = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])

    # to_arrow() returns a PyCapsule-protocol wrapper, not a pyarrow Table
    roundtripped = GeoDataFrame.from_arrow(original.to_arrow())
    assert_geodataframe_equal(roundtripped, original)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
    "geometry_type",
    ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
def test_geoarrow_import_from_extension_types(geometry_type, dim):
    # ensure data exported while geoarrow-pyarrow extension types are
    # registered can be imported back
    pytest.importorskip("pyproj")
    base_path = DATA_PATH / "geoarrow"
    suffix = geometry_type + ("_z" if dim == "xyz" else "")

    # Read the example data
    df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
    df["geometry"] = GeoSeries.from_wkb(df["geometry"])
    df = GeoDataFrame(df, crs="EPSG:3857")

    pytest.importorskip("geoarrow.pyarrow")

    export_kwargs = [
        {"geometry_encoding": "WKB"},
        {"geometry_encoding": "geoarrow"},
        {"geometry_encoding": "geoarrow", "interleaved": False},
    ]
    with with_geoarrow_extension_types():
        # every supported layout must round-trip to an equal frame
        for kwargs in export_kwargs:
            result = GeoDataFrame.from_arrow(pa_table(df.to_arrow(**kwargs)))
            assert_geodataframe_equal(result, df)
|
||||
|
||||
|
||||
def test_geoarrow_import_geoseries():
    """GeoSeries round-trips through geoarrow-pyarrow arrays (name excepted)."""
    pytest.importorskip("pyproj")
    geoarrow_pa = pytest.importorskip("geoarrow.pyarrow")
    ser = GeoSeries.from_wkt(["POINT (1 1)", "POINT (2 2)"], crs="EPSG:3857")

    with with_geoarrow_extension_types():
        for encoding in ("WKB", "geoarrow"):
            arr = geoarrow_pa.array(ser.to_arrow(geometry_encoding=encoding))
            assert_geoseries_equal(GeoSeries.from_arrow(arr), ser)

        # the name is lost when going through a pyarrow.Array
        ser.name = "name"
        arr = geoarrow_pa.array(ser.to_arrow())
        assert GeoSeries.from_arrow(arr).name is None
        # we can specify the name as one of the kwargs
        assert_geoseries_equal(GeoSeries.from_arrow(arr, name="test"), ser)
|
||||
|
||||
|
||||
def test_geoarrow_import_unknown_geoarrow_type():
    """A geoarrow.* extension name we don't know about raises TypeError."""
    gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
    table = pa_table(gdf.to_arrow())
    schema = table.schema

    # forge a geometry field claiming an unknown geoarrow extension
    forged_field = schema.field("geometry").with_metadata(
        {
            b"ARROW:extension:name": b"geoarrow.unknown",
            b"ARROW:extension:metadata": b"{}",
        }
    )
    forged_table = table.cast(pa.schema([schema.field(0), forged_field]))

    with pytest.raises(TypeError, match="Unknown GeoArrow extension type"):
        GeoDataFrame.from_arrow(forged_table)
|
||||
@@ -1,306 +0,0 @@
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from shapely.geometry import (
|
||||
LineString,
|
||||
MultiLineString,
|
||||
MultiPoint,
|
||||
MultiPolygon,
|
||||
Point,
|
||||
Polygon,
|
||||
)
|
||||
|
||||
from geopandas import GeoDataFrame
|
||||
from geopandas.io.file import infer_schema
|
||||
|
||||
import pytest
|
||||
|
||||
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere

# Closed 2D footprint of the Montreal city hall parcel.
city_hall_boundaries = Polygon(
    (
        (-73.5541107525234, 45.5091983609661),
        (-73.5546126200639, 45.5086813829106),
        (-73.5540185061397, 45.5084409343852),
        (-73.5539986525799, 45.5084323044531),
        (-73.5535801792994, 45.5089539203786),
        (-73.5541107525234, 45.5091983609661),
    )
)
# Adjacent parcel; shares an edge with city_hall_boundaries.
vauquelin_place = Polygon(
    (
        (-73.5542465586147, 45.5081555487952),
        (-73.5540185061397, 45.5084409343852),
        (-73.5546126200639, 45.5086813829106),
        (-73.5548825850032, 45.5084033554357),
        (-73.5542465586147, 45.5081555487952),
    )
)

# Two open line segments tracing parts of the city hall outline.
city_hall_walls = [
    LineString(
        (
            (-73.5541107525234, 45.5091983609661),
            (-73.5546126200639, 45.5086813829106),
            (-73.5540185061397, 45.5084409343852),
        )
    ),
    LineString(
        (
            (-73.5539986525799, 45.5084323044531),
            (-73.5535801792994, 45.5089539203786),
            (-73.5541107525234, 45.5091983609661),
        )
    ),
]

# Individual 2D point fixtures.
city_hall_entrance = Point(-73.553785, 45.508722)
city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)

# 3D variants (constant z = 300) used by the "3D ..." schema tests.
point_3D = Point(-73.553785, 45.508722, 300)
linestring_3D = LineString(
    (
        (-73.5541107525234, 45.5091983609661, 300),
        (-73.5546126200639, 45.5086813829106, 300),
        (-73.5540185061397, 45.5084409343852, 300),
    )
)
polygon_3D = Polygon(
    (
        (-73.5541107525234, 45.5091983609661, 300),
        (-73.5535801792994, 45.5089539203786, 300),
        (-73.5541107525234, 45.5091983609661, 300),
    )
)
|
||||
|
||||
|
||||
def test_infer_schema_only_points():
    """Plain points infer a bare 'Point' geometry schema."""
    frame = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
    expected = {"geometry": "Point", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_points_and_multipoints():
    """Mixed point/multipoint data lists both types, multi first."""
    geoms = [
        MultiPoint([city_hall_entrance, city_hall_balcony]),
        city_hall_balcony,
    ]
    expected = {"geometry": ["MultiPoint", "Point"], "properties": OrderedDict()}
    assert infer_schema(GeoDataFrame(geometry=geoms)) == expected
|
||||
|
||||
|
||||
def test_infer_schema_only_multipoints():
    """A single multipoint infers a bare 'MultiPoint' schema."""
    geoms = [
        MultiPoint([city_hall_entrance, city_hall_balcony, city_hall_council_chamber])
    ]
    expected = {"geometry": "MultiPoint", "properties": OrderedDict()}
    assert infer_schema(GeoDataFrame(geometry=geoms)) == expected
|
||||
|
||||
|
||||
def test_infer_schema_only_linestrings():
    """Linestrings only infer a bare 'LineString' schema."""
    expected = {"geometry": "LineString", "properties": OrderedDict()}
    assert infer_schema(GeoDataFrame(geometry=city_hall_walls)) == expected
|
||||
|
||||
|
||||
def test_infer_schema_linestrings_and_multilinestrings():
    """Mixed line/multiline data lists both types, multi first."""
    geoms = [MultiLineString(city_hall_walls), city_hall_walls[0]]
    expected = {
        "geometry": ["MultiLineString", "LineString"],
        "properties": OrderedDict(),
    }
    assert infer_schema(GeoDataFrame(geometry=geoms)) == expected
|
||||
|
||||
|
||||
def test_infer_schema_only_multilinestrings():
    """A single multilinestring infers a bare 'MultiLineString' schema."""
    frame = GeoDataFrame(geometry=[MultiLineString(city_hall_walls)])
    expected = {"geometry": "MultiLineString", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_only_polygons():
    """Polygons only infer a bare 'Polygon' schema."""
    frame = GeoDataFrame(geometry=[city_hall_boundaries, vauquelin_place])
    expected = {"geometry": "Polygon", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_polygons_and_multipolygons():
    """Mixed polygon/multipolygon data lists both types, multi first."""
    geoms = [
        MultiPolygon((city_hall_boundaries, vauquelin_place)),
        city_hall_boundaries,
    ]
    expected = {
        "geometry": ["MultiPolygon", "Polygon"],
        "properties": OrderedDict(),
    }
    assert infer_schema(GeoDataFrame(geometry=geoms)) == expected
|
||||
|
||||
|
||||
def test_infer_schema_only_multipolygons():
    """A single multipolygon infers a bare 'MultiPolygon' schema."""
    frame = GeoDataFrame(geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))])
    expected = {"geometry": "MultiPolygon", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_multiple_shape_types():
    """All six 2D types present: listed multi-before-single per family."""
    geoms = [
        MultiPolygon((city_hall_boundaries, vauquelin_place)),
        city_hall_boundaries,
        MultiLineString(city_hall_walls),
        city_hall_walls[0],
        MultiPoint([city_hall_entrance, city_hall_balcony]),
        city_hall_balcony,
    ]
    expected = {
        "geometry": [
            "MultiPolygon",
            "Polygon",
            "MultiLineString",
            "LineString",
            "MultiPoint",
            "Point",
        ],
        "properties": OrderedDict(),
    }
    assert infer_schema(GeoDataFrame(geometry=geoms)) == expected
|
||||
|
||||
|
||||
def test_infer_schema_mixed_3D_shape_type():
    """Adding one 3D point prepends '3D Point' to the mixed-type listing."""
    geoms = [
        MultiPolygon((city_hall_boundaries, vauquelin_place)),
        city_hall_boundaries,
        MultiLineString(city_hall_walls),
        city_hall_walls[0],
        MultiPoint([city_hall_entrance, city_hall_balcony]),
        city_hall_balcony,
        point_3D,
    ]
    expected = {
        "geometry": [
            "3D Point",
            "MultiPolygon",
            "Polygon",
            "MultiLineString",
            "LineString",
            "MultiPoint",
            "Point",
        ],
        "properties": OrderedDict(),
    }
    assert infer_schema(GeoDataFrame(geometry=geoms)) == expected
|
||||
|
||||
|
||||
def test_infer_schema_mixed_3D_Point():
    """2D + 3D points produce both '3D Point' and 'Point'."""
    frame = GeoDataFrame(geometry=[city_hall_balcony, point_3D])
    expected = {"geometry": ["3D Point", "Point"], "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_only_3D_Points():
    """Only 3D points infer a bare '3D Point' schema."""
    frame = GeoDataFrame(geometry=[point_3D, point_3D])
    expected = {"geometry": "3D Point", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_mixed_3D_linestring():
    """2D + 3D linestrings produce both '3D LineString' and 'LineString'."""
    frame = GeoDataFrame(geometry=[city_hall_walls[0], linestring_3D])
    expected = {
        "geometry": ["3D LineString", "LineString"],
        "properties": OrderedDict(),
    }
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_only_3D_linestrings():
    """Only 3D linestrings infer a bare '3D LineString' schema."""
    frame = GeoDataFrame(geometry=[linestring_3D, linestring_3D])
    expected = {"geometry": "3D LineString", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_mixed_3D_Polygon():
    """2D + 3D polygons produce both '3D Polygon' and 'Polygon'."""
    frame = GeoDataFrame(geometry=[city_hall_boundaries, polygon_3D])
    expected = {
        "geometry": ["3D Polygon", "Polygon"],
        "properties": OrderedDict(),
    }
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_only_3D_Polygons():
    """Only 3D polygons infer a bare '3D Polygon' schema."""
    frame = GeoDataFrame(geometry=[polygon_3D, polygon_3D])
    expected = {"geometry": "3D Polygon", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_null_geometry_and_2D_point():
    """A None geometry alongside a point is omitted from the schema."""
    frame = GeoDataFrame(geometry=[None, city_hall_entrance])
    expected = {"geometry": "Point", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_null_geometry_and_3D_point():
    """A None geometry alongside a 3D point is omitted from the schema."""
    frame = GeoDataFrame(geometry=[None, point_3D])
    expected = {"geometry": "3D Point", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_null_geometry_all():
    """All-None geometry falls back to Fiona's default 'Unknown' type."""
    frame = GeoDataFrame(geometry=[None, None])
    expected = {"geometry": "Unknown", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "array_data,dtype", [([1, 2**31 - 1], np.int32), ([1, np.nan], pd.Int32Dtype())]
)
def test_infer_schema_int32(array_data, dtype):
    """Both numpy int32 and nullable Int32 columns map to 'int32'."""
    frame = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
    frame["int32_column"] = pd.array(data=array_data, dtype=dtype)

    expected = {
        "geometry": "Point",
        "properties": OrderedDict([("int32_column", "int32")]),
    }
    assert infer_schema(frame) == expected
|
||||
|
||||
|
||||
def test_infer_schema_int64():
    """A nullable Int64 column maps to the plain 'int' schema type."""
    frame = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
    frame["int64_column"] = pd.array([1, np.nan], dtype=pd.Int64Dtype())

    expected = {
        "geometry": "Point",
        "properties": OrderedDict([("int64_column", "int")]),
    }
    assert infer_schema(frame) == expected
|
||||
@@ -1,56 +0,0 @@
|
||||
"""
|
||||
See generate_legacy_storage_files.py for the creation of the legacy files.
|
||||
|
||||
"""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import pytest
|
||||
from geopandas.testing import assert_geodataframe_equal
|
||||
|
||||
# Base directory for on-disk test fixtures (legacy pickles live in DATA_PATH / "pickle").
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def current_pickle_data():
    # our current version pickle data
    # Imported lazily so collecting this module does not require the
    # generator script's dependencies unless the fixture is actually used.
    from .generate_legacy_storage_files import create_pickle_data

    return create_pickle_data()
|
||||
|
||||
|
||||
# All previously-generated legacy pickle files shipped with the test data.
files = glob.glob(str(DATA_PATH / "pickle" / "*.pickle"))
|
||||
|
||||
|
||||
# Use os.path.basename rather than splitting on "/": glob returns
# backslash-separated paths on Windows, which previously left the full
# path in the test id instead of just the file name.
@pytest.fixture(params=files, ids=[os.path.basename(p) for p in files])
def legacy_pickle(request):
    """Yield the path of each legacy pickle file, one per parametrized test."""
    return request.param
|
||||
|
||||
|
||||
@pytest.mark.skip(
    reason=(
        "shapely 2.0/pygeos-based unpickling currently only works for "
        "shapely-2.0/pygeos-written files"
    ),
)
def test_legacy_pickles(current_pickle_data, legacy_pickle):
    """Each frame in a legacy pickle must equal the current-version build."""
    unpickled = pd.read_pickle(legacy_pickle)

    for name, frame in unpickled.items():
        assert_geodataframe_equal(frame, current_pickle_data[name])
|
||||
|
||||
|
||||
def test_round_trip_current(tmpdir, current_pickle_data):
    """Every current-version frame survives a pickle write/read round trip."""
    for name, frame in current_pickle_data.items():
        target = str(tmpdir / f"{name}.pickle")
        frame.to_pickle(target)
        restored = pd.read_pickle(target)
        assert_geodataframe_equal(restored, frame)
        # the spatial index flag must be a plain bool after unpickling
        assert isinstance(restored.has_sindex, bool)
|
||||
@@ -1,878 +0,0 @@
|
||||
"""
|
||||
Tests here include reading/writing to different types of spatial databases.
|
||||
The spatial database tests may not work without additional system
|
||||
configuration. postGIS tests require a test database to have been setup;
|
||||
see geopandas.tests.util for more information.
|
||||
"""
|
||||
|
||||
import os
|
||||
import warnings
|
||||
from importlib.util import find_spec
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import geopandas
|
||||
import geopandas._compat as compat
|
||||
from geopandas import GeoDataFrame, read_file, read_postgis
|
||||
from geopandas._compat import HAS_PYPROJ
|
||||
from geopandas.io.sql import _get_conn as get_conn
|
||||
from geopandas.io.sql import _write_postgis as write_postgis
|
||||
|
||||
import pytest
|
||||
from geopandas.tests.util import (
|
||||
create_postgis,
|
||||
create_spatialite,
|
||||
mock,
|
||||
validate_boro_df,
|
||||
)
|
||||
|
||||
try:
    from sqlalchemy import text
except ImportError:
    # Avoid local imports for text in all sqlalchemy tests
    # all tests using text use engine_postgis, which ensures sqlalchemy is available
    # (str is a harmless stand-in so module collection succeeds without sqlalchemy)
    text = str
|
||||
|
||||
|
||||
@pytest.fixture
def df_nybb(nybb_filename):
    """Return the NYC boroughs sample dataset as a GeoDataFrame."""
    return read_file(nybb_filename)
|
||||
|
||||
|
||||
def check_available_postgis_drivers() -> list[str]:
    """Work out which of psycopg2 and psycopg are available.

    This prevents tests running if the relevant package isn't installed
    (rather than being skipped, as skips are treated as failures during postgis CI)
    """
    # preserve the historical probe order: psycopg before psycopg2
    return [name for name in ("psycopg", "psycopg2") if find_spec(name)]
|
||||
|
||||
|
||||
# Computed once at collection time; parametrizes the postgres fixtures below.
POSTGIS_DRIVERS = check_available_postgis_drivers()
|
||||
|
||||
|
||||
def prepare_database_credentials() -> dict:
    """Gather postgres connection credentials from environment variables."""
    credentials = {"dbname": "test_geopandas"}
    # unset variables come back as None, which the drivers treat as defaults
    for key, env_var in (
        ("user", "PGUSER"),
        ("password", "PGPASSWORD"),
        ("host", "PGHOST"),
        ("port", "PGPORT"),
    ):
        credentials[key] = os.environ.get(env_var)
    return credentials
|
||||
|
||||
|
||||
@pytest.fixture()
def connection_postgis(request):
    """Create a postgres connection using either psycopg2 or psycopg.

    Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS."""
    driver = pytest.importorskip(request.param)

    try:
        con = driver.connect(**prepare_database_credentials())
    except driver.OperationalError:
        pytest.skip("Cannot connect with postgresql database")
    with warnings.catch_warnings():
        # pandas warns when handed a raw DBAPI connection; irrelevant for these tests
        warnings.filterwarnings(
            "ignore", message="pandas only supports SQLAlchemy connectable.*"
        )
        yield con
    con.close()
|
||||
|
||||
|
||||
@pytest.fixture()
def engine_postgis(request):
    """
    Initiate a sqlalchemy connection engine using either psycopg2 or psycopg.

    Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS.
    """
    sqlalchemy = pytest.importorskip("sqlalchemy")
    from sqlalchemy.engine.url import URL

    creds = prepare_database_credentials()
    try:
        engine = sqlalchemy.create_engine(
            URL.create(
                drivername=f"postgresql+{request.param}",
                username=creds["user"],
                database=creds["dbname"],
                password=creds["password"],
                host=creds["host"],
                port=creds["port"],
            )
        )
        # probe the connection eagerly so unreachable servers skip, not fail
        engine.connect()
    except Exception:
        pytest.skip("Cannot connect with postgresql database")

    yield engine
    engine.dispose()
|
||||
|
||||
|
||||
@pytest.fixture()
def connection_spatialite():
    """
    Return a memory-based SQLite3 connection with SpatiaLite enabled & initialized.

    `The sqlite3 module must be built with loadable extension support
    <https://docs.python.org/3/library/sqlite3.html#f1>`_ and
    `SpatiaLite <https://www.gaia-gis.it/fossil/libspatialite/index>`_
    must be available on the system as a SQLite module.
    Packages available on Anaconda meet requirements.

    Exceptions
    ----------
    ``AttributeError`` on missing support for loadable SQLite extensions
    ``sqlite3.OperationalError`` on missing SpatiaLite
    """
    sqlite3 = pytest.importorskip("sqlite3")
    # Bug fix: previously ``con`` was referenced in the except handler even
    # when ``sqlite3.connect`` itself raised, producing a NameError that
    # masked the intended skip. Initialize it first and close conditionally.
    con = None
    try:
        # the with-block commits the InitSpatialMetaData transaction on success
        with sqlite3.connect(":memory:") as con:
            con.enable_load_extension(True)
            con.load_extension("mod_spatialite")
            con.execute("SELECT InitSpatialMetaData(TRUE)")
    except Exception:
        if con is not None:
            con.close()
        pytest.skip("Cannot setup spatialite database")

    yield con
    con.close()
|
||||
|
||||
|
||||
def drop_table_if_exists(conn_or_engine, table):
    """Drop *table* from the database behind *conn_or_engine*, if present."""
    sqlalchemy = pytest.importorskip("sqlalchemy")

    if not sqlalchemy.inspect(conn_or_engine).has_table(table):
        return
    metadata = sqlalchemy.MetaData()
    with warnings.catch_warnings():
        # geometry columns are not recognized by sqlalchemy reflection
        warnings.filterwarnings(
            "ignore", message="Did not recognize type 'geometry' of column.*"
        )
        metadata.reflect(conn_or_engine)
    # avoid shadowing the parameter: look the reflected table object up by name
    table_obj = metadata.tables.get(table)
    if table_obj is not None:
        table_obj.drop(conn_or_engine, checkfirst=True)
|
||||
|
||||
|
||||
@pytest.fixture
def df_mixed_single_and_multi():
    """Frame mixing LineString, MultiLineString and Point geometries."""
    from shapely.geometry import LineString, MultiLineString, Point

    geometries = [
        LineString([(0, 0), (1, 1)]),
        MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
        Point(0, 1),
    ]
    return geopandas.GeoDataFrame({"geometry": geometries}, crs="epsg:4326")
|
||||
|
||||
|
||||
@pytest.fixture
def df_geom_collection():
    """Frame with a single GeometryCollection of polygon, line and point."""
    from shapely.geometry import GeometryCollection, LineString, Point, Polygon

    collection = GeometryCollection(
        [
            Polygon([(0, 0), (1, 1), (0, 1)]),
            LineString([(0, 0), (1, 1)]),
            Point(0, 0),
        ]
    )
    return geopandas.GeoDataFrame({"geometry": [collection]}, crs="epsg:4326")
|
||||
|
||||
|
||||
@pytest.fixture
def df_linear_ring():
    """Frame with a single LinearRing (unit square) geometry."""
    from shapely.geometry import LinearRing

    ring = LinearRing(((0, 0), (0, 1), (1, 1), (1, 0)))
    return geopandas.GeoDataFrame({"geometry": [ring]}, crs="epsg:4326")
|
||||
|
||||
|
||||
@pytest.fixture
def df_3D_geoms():
    """Frame containing 3D (z-coordinate) line, polygon and point geometries."""
    from shapely.geometry import LineString, Point, Polygon

    geometries = [
        LineString([(0, 0, 0), (1, 1, 1)]),
        Polygon([(0, 0, 0), (1, 1, 1), (0, 1, 1)]),
        Point(0, 1, 2),
    ]
    return geopandas.GeoDataFrame({"geometry": geometries}, crs="epsg:4326")
|
||||
|
||||
|
||||
class TestIO:
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_get_conn(self, engine_postgis):
|
||||
Connection = pytest.importorskip("sqlalchemy.engine.base").Connection
|
||||
|
||||
engine = engine_postgis
|
||||
with get_conn(engine) as output:
|
||||
assert isinstance(output, Connection)
|
||||
with engine.connect() as conn:
|
||||
with get_conn(conn) as output:
|
||||
assert isinstance(output, Connection)
|
||||
with pytest.raises(ValueError):
|
||||
with get_conn(object()):
|
||||
pass
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_default(self, connection_postgis, df_nybb):
|
||||
con = connection_postgis
|
||||
create_postgis(con, df_nybb)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = read_postgis(sql, con)
|
||||
|
||||
validate_boro_df(df)
|
||||
# no crs defined on the created geodatabase, and none specified
|
||||
# by user; should not be set to 0, as from get_srid failure
|
||||
assert df.crs is None
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_custom_geom_col(self, connection_postgis, df_nybb):
|
||||
con = connection_postgis
|
||||
geom_col = "the_geom"
|
||||
create_postgis(con, df_nybb, geom_col=geom_col)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = read_postgis(sql, con, geom_col=geom_col)
|
||||
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_select_geom_as(self, connection_postgis, df_nybb):
|
||||
"""Tests that a SELECT {geom} AS {some_other_geom} works."""
|
||||
con = connection_postgis
|
||||
orig_geom = "geom"
|
||||
out_geom = "the_geom"
|
||||
create_postgis(con, df_nybb, geom_col=orig_geom)
|
||||
|
||||
sql = """SELECT borocode, boroname, shape_leng, shape_area,
|
||||
{} as {} FROM nybb;""".format(
|
||||
orig_geom, out_geom
|
||||
)
|
||||
df = read_postgis(sql, con, geom_col=out_geom)
|
||||
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_get_srid(self, connection_postgis, df_nybb):
|
||||
"""Tests that an SRID can be read from a geodatabase (GH #451)."""
|
||||
con = connection_postgis
|
||||
crs = "epsg:4269"
|
||||
df_reproj = df_nybb.to_crs(crs)
|
||||
create_postgis(con, df_reproj, srid=4269)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = read_postgis(sql, con)
|
||||
|
||||
validate_boro_df(df)
|
||||
assert df.crs == crs
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_override_srid(self, connection_postgis, df_nybb):
|
||||
"""Tests that a user specified CRS overrides the geodatabase SRID."""
|
||||
con = connection_postgis
|
||||
orig_crs = df_nybb.crs
|
||||
create_postgis(con, df_nybb, srid=4269)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = read_postgis(sql, con, crs=orig_crs)
|
||||
|
||||
validate_boro_df(df)
|
||||
assert df.crs == orig_crs
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_from_postgis_default(self, connection_postgis, df_nybb):
|
||||
con = connection_postgis
|
||||
create_postgis(con, df_nybb)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = GeoDataFrame.from_postgis(sql, con)
|
||||
|
||||
validate_boro_df(df, case_sensitive=False)
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_from_postgis_custom_geom_col(self, connection_postgis, df_nybb):
|
||||
con = connection_postgis
|
||||
geom_col = "the_geom"
|
||||
create_postgis(con, df_nybb, geom_col=geom_col)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = GeoDataFrame.from_postgis(sql, con, geom_col=geom_col)
|
||||
|
||||
validate_boro_df(df, case_sensitive=False)
|
||||
|
||||
def test_read_postgis_null_geom(self, connection_spatialite, df_nybb):
|
||||
"""Tests that geometry with NULL is accepted."""
|
||||
con = connection_spatialite
|
||||
geom_col = df_nybb.geometry.name
|
||||
df_nybb.geometry.iat[0] = None
|
||||
create_spatialite(con, df_nybb)
|
||||
sql = (
|
||||
"SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
|
||||
'AsEWKB("{0}") AS "{0}" FROM nybb'.format(geom_col)
|
||||
)
|
||||
df = read_postgis(sql, con, geom_col=geom_col)
|
||||
validate_boro_df(df)
|
||||
|
||||
def test_read_postgis_binary(self, connection_spatialite, df_nybb):
|
||||
"""Tests that geometry read as binary is accepted."""
|
||||
con = connection_spatialite
|
||||
geom_col = df_nybb.geometry.name
|
||||
create_spatialite(con, df_nybb)
|
||||
sql = (
|
||||
"SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
|
||||
'ST_AsBinary("{0}") AS "{0}" FROM nybb'.format(geom_col)
|
||||
)
|
||||
df = read_postgis(sql, con, geom_col=geom_col)
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_chunksize(self, connection_postgis, df_nybb):
|
||||
"""Test chunksize argument"""
|
||||
chunksize = 2
|
||||
con = connection_postgis
|
||||
create_postgis(con, df_nybb)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
|
||||
|
||||
validate_boro_df(df)
|
||||
# no crs defined on the created geodatabase, and none specified
|
||||
# by user; should not be set to 0, as from get_srid failure
|
||||
assert df.crs is None
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_default(self, engine_postgis, df_nybb):
|
||||
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
|
||||
engine = engine_postgis
|
||||
table = "nybb"
|
||||
|
||||
# If table exists, delete it before trying to write with defaults
|
||||
drop_table_if_exists(engine, table)
|
||||
|
||||
# Write to db
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
|
||||
# Validate
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_uppercase_tablename(self, engine_postgis, df_nybb):
|
||||
"""Tests writing GeoDataFrame to PostGIS with uppercase tablename."""
|
||||
engine = engine_postgis
|
||||
table = "aTestTable"
|
||||
|
||||
# If table exists, delete it before trying to write with defaults
|
||||
drop_table_if_exists(engine, table)
|
||||
|
||||
# Write to db
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
|
||||
# Validate
|
||||
sql = text('SELECT * FROM "{table}";'.format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_sqlalchemy_connection(self, engine_postgis, df_nybb):
|
||||
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
|
||||
with engine_postgis.begin() as con:
|
||||
table = "nybb_con"
|
||||
|
||||
# If table exists, delete it before trying to write with defaults
|
||||
drop_table_if_exists(con, table)
|
||||
|
||||
# Write to db
|
||||
write_postgis(df_nybb, con=con, name=table, if_exists="fail")
|
||||
# Validate
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, con, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_fail_when_table_exists(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that uploading the same table raises error when: if_replace='fail'.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
|
||||
# Ensure table exists
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
|
||||
try:
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
|
||||
except ValueError as e:
|
||||
if "already exists" in str(e):
|
||||
pass
|
||||
else:
|
||||
raise e
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_replace_when_table_exists(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that replacing a table is possible when: if_replace='replace'.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
|
||||
# Ensure table exists
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
# Overwrite
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
# Validate
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_append_when_table_exists(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that appending to existing table produces correct results when:
|
||||
if_replace='append'.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
|
||||
orig_rows, orig_cols = df_nybb.shape
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="append")
|
||||
# Validate
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
new_rows, new_cols = df.shape
|
||||
|
||||
# There should be twice as many rows in the new table
|
||||
assert new_rows == orig_rows * 2, (
|
||||
"There should be {target} rows,found: {current}".format(
|
||||
target=orig_rows * 2, current=new_rows
|
||||
),
|
||||
)
|
||||
# Number of columns should stay the same
|
||||
assert new_cols == orig_cols, (
|
||||
"There should be {target} columns,found: {current}".format(
|
||||
target=orig_cols, current=new_cols
|
||||
),
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_without_crs(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that GeoDataFrame can be written to PostGIS without CRS information.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
|
||||
# Write to db
|
||||
df_nybb.geometry.array.crs = None
|
||||
with pytest.warns(UserWarning, match="Could not parse CRS from the GeoDataF"):
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
# Validate that srid is -1
|
||||
sql = text(
|
||||
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
|
||||
schema="public", table=table, geom_col="geometry"
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
target_srid = conn.execute(sql).fetchone()[0]
|
||||
assert target_srid == 0, "SRID should be 0, found %s" % target_srid
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_with_esri_authority(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that GeoDataFrame can be written to PostGIS with ESRI Authority
|
||||
CRS information (GH #2414).
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
|
||||
# Write to db
|
||||
df_nybb_esri = df_nybb.to_crs("ESRI:102003")
|
||||
write_postgis(df_nybb_esri, con=engine, name=table, if_exists="replace")
|
||||
# Validate that srid is 102003
|
||||
sql = text(
|
||||
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
|
||||
schema="public", table=table, geom_col="geometry"
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
target_srid = conn.execute(sql).fetchone()[0]
|
||||
assert target_srid == 102003, "SRID should be 102003, found %s" % target_srid
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_geometry_collection(
|
||||
self, engine_postgis, df_geom_collection
|
||||
):
|
||||
"""
|
||||
Tests that writing a mix of different geometry types is possible.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "geomtype_tests"
|
||||
|
||||
write_postgis(df_geom_collection, con=engine, name=table, if_exists="replace")
|
||||
|
||||
# Validate geometry type
|
||||
sql = text(
|
||||
"SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
|
||||
table=table
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
geom_type = conn.execute(sql).fetchone()[0]
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
|
||||
assert geom_type.upper() == "GEOMETRYCOLLECTION"
|
||||
assert df.geom_type.unique()[0] == "GeometryCollection"
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_mixed_geometry_types(
|
||||
self, engine_postgis, df_mixed_single_and_multi
|
||||
):
|
||||
"""
|
||||
Tests that writing a mix of single and MultiGeometries is possible.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "geomtype_tests"
|
||||
|
||||
write_postgis(
|
||||
df_mixed_single_and_multi, con=engine, name=table, if_exists="replace"
|
||||
)
|
||||
|
||||
# Validate geometry type
|
||||
sql = text(
|
||||
"SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
|
||||
table=table
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
res = conn.execute(sql).fetchall()
|
||||
assert res[0][0].upper() == "LINESTRING"
|
||||
assert res[1][0].upper() == "MULTILINESTRING"
|
||||
assert res[2][0].upper() == "POINT"
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_linear_ring(self, engine_postgis, df_linear_ring):
|
||||
"""
|
||||
Tests that writing a LinearRing.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "geomtype_tests"
|
||||
|
||||
write_postgis(df_linear_ring, con=engine, name=table, if_exists="replace")
|
||||
|
||||
# Validate geometry type
|
||||
sql = text(
|
||||
"SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
|
||||
table=table
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
geom_type = conn.execute(sql).fetchone()[0]
|
||||
|
||||
assert geom_type.upper() == "LINESTRING"
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_in_chunks(self, engine_postgis, df_mixed_single_and_multi):
|
||||
"""
|
||||
Tests writing a LinearRing works.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "geomtype_tests"
|
||||
|
||||
write_postgis(
|
||||
df_mixed_single_and_multi,
|
||||
con=engine,
|
||||
name=table,
|
||||
if_exists="replace",
|
||||
chunksize=1,
|
||||
)
|
||||
# Validate row count
|
||||
sql = text("SELECT COUNT(geometry) FROM {table};".format(table=table))
|
||||
with engine.connect() as conn:
|
||||
row_cnt = conn.execute(sql).fetchone()[0]
|
||||
assert row_cnt == 3
|
||||
|
||||
# Validate geometry type
|
||||
sql = text(
|
||||
"SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
|
||||
table=table
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
res = conn.execute(sql).fetchall()
|
||||
assert res[0][0].upper() == "LINESTRING"
|
||||
assert res[1][0].upper() == "MULTILINESTRING"
|
||||
assert res[2][0].upper() == "POINT"
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_to_different_schema(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests writing data to alternative schema.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
schema_to_use = "test"
|
||||
sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
|
||||
with engine.begin() as conn:
|
||||
conn.execute(sql)
|
||||
|
||||
write_postgis(
|
||||
df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
|
||||
)
|
||||
# Validate
|
||||
sql = text(
|
||||
"SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
|
||||
)
|
||||
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_to_different_schema_when_table_exists(
|
||||
self, engine_postgis, df_nybb
|
||||
):
|
||||
"""
|
||||
Tests writing data to alternative schema.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
schema_to_use = "test"
|
||||
sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
|
||||
with engine.begin() as conn:
|
||||
conn.execute(sql)
|
||||
|
||||
try:
|
||||
write_postgis(
|
||||
df_nybb, con=engine, name=table, if_exists="fail", schema=schema_to_use
|
||||
)
|
||||
# Validate
|
||||
sql = text(
|
||||
"SELECT * FROM {schema}.{table};".format(
|
||||
schema=schema_to_use, table=table
|
||||
)
|
||||
)
|
||||
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
# Should raise a ValueError when table exists
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# Try with replace flag on
|
||||
write_postgis(
|
||||
df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
|
||||
)
|
||||
# Validate
|
||||
sql = text(
|
||||
"SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
|
||||
)
|
||||
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_3D_geometries(self, engine_postgis, df_3D_geoms):
|
||||
"""
|
||||
Tests writing a geometries with 3 dimensions works.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "geomtype_tests"
|
||||
|
||||
write_postgis(df_3D_geoms, con=engine, name=table, if_exists="replace")
|
||||
|
||||
# Check that all geometries have 3 dimensions
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
assert list(df.geometry.has_z) == [True, True, True]
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_row_order(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that the row order in db table follows the order of the original frame.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "row_order_test"
|
||||
correct_order = df_nybb["BoroCode"].tolist()
|
||||
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
|
||||
# Check that the row order matches
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
assert df["BoroCode"].tolist() == correct_order
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_append_before_table_exists(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that insert works with if_exists='append' when table does not exist yet.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
# If table exists, delete it before trying to write with defaults
|
||||
drop_table_if_exists(engine, table)
|
||||
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="append")
|
||||
|
||||
# Check that the row order matches
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_append_with_different_crs(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that the warning is raised if table CRS differs from frame.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
|
||||
# Reproject
|
||||
df_nybb2 = df_nybb.to_crs(epsg=4326)
|
||||
|
||||
# Should raise error when appending
|
||||
with pytest.raises(ValueError, match="CRS of the target table"):
|
||||
write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_append_without_crs(self, engine_postgis, df_nybb):
|
||||
# This test was included in #3328 when the default value for no
|
||||
# CRS was changed from an SRID of -1 to 0. This resolves issues
|
||||
# of appending dataframes to postgis that have no CRS as postgis
|
||||
# no CRS value is 0.
|
||||
engine = engine_postgis
|
||||
df_nybb = df_nybb.set_crs(None, allow_override=True)
|
||||
table = "nybb"
|
||||
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
# append another dataframe with no crs
|
||||
|
||||
df_nybb2 = df_nybb
|
||||
write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
@pytest.mark.xfail(
|
||||
compat.PANDAS_GE_20 and not compat.PANDAS_GE_202,
|
||||
reason="Duplicate columns are dropped in read_sql with pandas 2.0.0 and 2.0.1",
|
||||
)
|
||||
def test_duplicate_geometry_column_fails(self, engine_postgis):
|
||||
"""
|
||||
Tests that a ValueError is raised if an SQL query returns two geometry columns.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
sql = "select ST_MakePoint(0, 0) as geom, ST_MakePoint(0, 0) as geom;"
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
read_postgis(sql, engine, geom_col="geom")
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_non_epsg_crs(self, connection_postgis, df_nybb):
|
||||
con = connection_postgis
|
||||
df_nybb = df_nybb.to_crs(crs="esri:54052")
|
||||
create_postgis(con, df_nybb, srid=54052)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = read_postgis(sql, con)
|
||||
validate_boro_df(df)
|
||||
assert df.crs == "ESRI:54052"
|
||||
|
||||
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
|
||||
@mock.patch("shapely.get_srid")
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_srid_not_in_table(self, mock_get_srid, connection_postgis, df_nybb):
|
||||
# mock a non-existent srid for edge case if shapely has an srid
|
||||
# not present in postgis table.
|
||||
pyproj = pytest.importorskip("pyproj")
|
||||
|
||||
mock_get_srid.return_value = 99999
|
||||
|
||||
con = connection_postgis
|
||||
df_nybb = df_nybb.to_crs(crs="epsg:4326")
|
||||
create_postgis(con, df_nybb)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
with pytest.raises(pyproj.exceptions.CRSError, match="crs not found"):
|
||||
with pytest.warns(UserWarning, match="Could not find srid 99999"):
|
||||
read_postgis(sql, con)
|
||||
|
||||
@mock.patch("geopandas.io.sql._get_spatial_ref_sys_df")
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_no_spatial_ref_sys_table_in_postgis(
|
||||
self, mock_get_spatial_ref_sys_df, connection_postgis, df_nybb
|
||||
):
|
||||
# mock for a non-existent spatial_ref_sys database
|
||||
|
||||
mock_get_spatial_ref_sys_df.side_effect = pd.errors.DatabaseError
|
||||
|
||||
con = connection_postgis
|
||||
df_nybb = df_nybb.to_crs(crs="epsg:4326")
|
||||
create_postgis(con, df_nybb, srid=4326)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
with pytest.warns(
|
||||
UserWarning, match="Could not find the spatial reference system table"
|
||||
):
|
||||
df = read_postgis(sql, con)
|
||||
|
||||
assert df.crs == "EPSG:4326"
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_non_epsg_crs_chunksize(self, connection_postgis, df_nybb):
|
||||
"""Test chunksize argument with non epsg crs"""
|
||||
chunksize = 2
|
||||
con = connection_postgis
|
||||
df_nybb = df_nybb.to_crs(crs="esri:54052")
|
||||
|
||||
create_postgis(con, df_nybb, srid=54052)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
|
||||
|
||||
validate_boro_df(df)
|
||||
assert df.crs == "ESRI:54052"
|
||||
@@ -1,118 +0,0 @@
|
||||
"""Vendored, cut down version of pyogrio/util.py for use with fiona"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
def vsi_path(path: str) -> str:
|
||||
"""
|
||||
Ensure path is a local path or a GDAL-compatible vsi path.
|
||||
|
||||
"""
|
||||
|
||||
# path is already in GDAL format
|
||||
if path.startswith("/vsi"):
|
||||
return path
|
||||
|
||||
# Windows drive letters (e.g. "C:\") confuse `urlparse` as they look like
|
||||
# URL schemes
|
||||
if sys.platform == "win32" and re.match("^[a-zA-Z]\\:", path):
|
||||
if not path.split("!")[0].endswith(".zip"):
|
||||
return path
|
||||
|
||||
# prefix then allow to proceed with remaining parsing
|
||||
path = f"zip://{path}"
|
||||
|
||||
path, archive, scheme = _parse_uri(path)
|
||||
|
||||
if scheme or archive or path.endswith(".zip"):
|
||||
return _construct_vsi_path(path, archive, scheme)
|
||||
|
||||
return path
|
||||
|
||||
|
||||
# Supported URI schemes and their mapping to GDAL's VSI suffix.
|
||||
SCHEMES = {
|
||||
"file": "file",
|
||||
"zip": "zip",
|
||||
"tar": "tar",
|
||||
"gzip": "gzip",
|
||||
"http": "curl",
|
||||
"https": "curl",
|
||||
"ftp": "curl",
|
||||
"s3": "s3",
|
||||
"gs": "gs",
|
||||
"az": "az",
|
||||
"adls": "adls",
|
||||
"adl": "adls", # fsspec uses this
|
||||
"hdfs": "hdfs",
|
||||
"webhdfs": "webhdfs",
|
||||
# GDAL additionally supports oss and swift for remote filesystems, but
|
||||
# those are for now not added as supported URI
|
||||
}
|
||||
|
||||
CURLSCHEMES = {k for k, v in SCHEMES.items() if v == "curl"}
|
||||
|
||||
|
||||
def _parse_uri(path: str):
|
||||
"""
|
||||
Parse a URI
|
||||
|
||||
Returns a tuples of (path, archive, scheme)
|
||||
|
||||
path : str
|
||||
Parsed path. Includes the hostname and query string in the case
|
||||
of a URI.
|
||||
archive : str
|
||||
Parsed archive path.
|
||||
scheme : str
|
||||
URI scheme such as "https" or "zip+s3".
|
||||
"""
|
||||
parts = urlparse(path, allow_fragments=False)
|
||||
|
||||
# if the scheme is not one of GDAL's supported schemes, return raw path
|
||||
if parts.scheme and not all(p in SCHEMES for p in parts.scheme.split("+")):
|
||||
return path, "", ""
|
||||
|
||||
# we have a URI
|
||||
path = parts.path
|
||||
scheme = parts.scheme or ""
|
||||
|
||||
if parts.query:
|
||||
path += "?" + parts.query
|
||||
|
||||
if parts.scheme and parts.netloc:
|
||||
path = parts.netloc + path
|
||||
|
||||
parts = path.split("!")
|
||||
path = parts.pop() if parts else ""
|
||||
archive = parts.pop() if parts else ""
|
||||
return (path, archive, scheme)
|
||||
|
||||
|
||||
def _construct_vsi_path(path, archive, scheme) -> str:
|
||||
"""Convert a parsed path to a GDAL VSI path"""
|
||||
|
||||
prefix = ""
|
||||
suffix = ""
|
||||
schemes = scheme.split("+")
|
||||
|
||||
if "zip" not in schemes and (archive.endswith(".zip") or path.endswith(".zip")):
|
||||
schemes.insert(0, "zip")
|
||||
|
||||
if schemes:
|
||||
prefix = "/".join(
|
||||
"vsi{0}".format(SCHEMES[p]) for p in schemes if p and p != "file"
|
||||
)
|
||||
|
||||
if schemes[-1] in CURLSCHEMES:
|
||||
suffix = f"{schemes[-1]}://"
|
||||
|
||||
if prefix:
|
||||
if archive:
|
||||
return "/{}/{}{}/{}".format(prefix, suffix, archive, path.lstrip("/"))
|
||||
else:
|
||||
return "/{}/{}{}".format(prefix, suffix, path)
|
||||
|
||||
return path
|
||||
Reference in New Issue
Block a user