del env py

2024-10-11 17:10:34 -07:00
parent 55b630e6c8
commit b010ab0e6d
19334 changed files with 1 addition and 4003544 deletions

View File

@@ -1,614 +0,0 @@
import json
from packaging.version import Version
from typing import Dict, Optional, Tuple
import numpy as np
import pandas as pd
import pyarrow as pa
from numpy.typing import NDArray
import shapely
from shapely import GeometryType
from geopandas import GeoDataFrame
from geopandas._compat import SHAPELY_GE_204
from geopandas.array import from_shapely, from_wkb
GEOARROW_ENCODINGS = [
"point",
"linestring",
"polygon",
"multipoint",
"multilinestring",
"multipolygon",
]
## GeoPandas -> GeoArrow
class ArrowTable:
"""
Wrapper class for Arrow data.
This class implements the `Arrow PyCapsule Protocol`_ (i.e. having an
``__arrow_c_stream__`` method). This object can then be consumed by
your Arrow implementation of choice that supports this protocol.
.. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
Example
-------
>>> import pyarrow as pa
>>> pa.table(gdf.to_arrow()) # doctest: +SKIP
"""
def __init__(self, pa_table):
self._pa_table = pa_table
def __arrow_c_stream__(self, requested_schema=None):
return self._pa_table.__arrow_c_stream__(requested_schema=requested_schema)
class GeoArrowArray:
"""
Wrapper class for a geometry array as Arrow data.
This class implements the `Arrow PyCapsule Protocol`_ (i.e. having an
``__arrow_c_array/stream__`` method). This object can then be consumed by
your Arrow implementation of choice that supports this protocol.
.. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
Example
-------
>>> import pyarrow as pa
>>> pa.array(ser.to_arrow()) # doctest: +SKIP
"""
def __init__(self, pa_field, pa_array):
self._pa_array = pa_array
self._pa_field = pa_field
def __arrow_c_array__(self, requested_schema=None):
if requested_schema is not None:
raise NotImplementedError(
"Requested schema is not supported for geometry arrays"
)
return (
self._pa_field.__arrow_c_schema__(),
self._pa_array.__arrow_c_array__()[1],
)
def geopandas_to_arrow(
df,
index=None,
geometry_encoding="WKB",
interleaved=True,
include_z=None,
):
"""
Convert GeoDataFrame to a pyarrow.Table.
Parameters
----------
df : GeoDataFrame
The GeoDataFrame to convert.
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output except `RangeIndex` which is stored as metadata only.
geometry_encoding : {'WKB', 'geoarrow' }, default 'WKB'
The GeoArrow encoding to use for the data conversion.
interleaved : bool, default True
Only relevant for 'geoarrow' encoding. If True, the geometries'
coordinates are interleaved in a single fixed size list array.
If False, the coordinates are stored as separate arrays in a
struct type.
include_z : bool, default None
Only relevant for 'geoarrow' encoding (for WKB, the dimensionality
of the individual geometries is preserved).
If False, return 2D geometries. If True, include the third dimension
in the output (if a geometry has no third dimension, the z-coordinates
will be NaN). By default, will infer the dimensionality from the
input geometries. Note that this inference can be unreliable with
empty geometries (for a guaranteed result, it is recommended to
specify the keyword).
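Examples
--------
A minimal sketch; ``gdf`` is assumed to be an existing GeoDataFrame with a
single geometry column named "geometry":
>>> table, encodings = geopandas_to_arrow(gdf)  # doctest: +SKIP
>>> encodings  # doctest: +SKIP
{'geometry': 'WKB'}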
"""
mask = df.dtypes == "geometry"
geometry_columns = df.columns[mask]
geometry_indices = np.asarray(mask).nonzero()[0]
df_attr = pd.DataFrame(df.copy(deep=False))
# replace geometry columns with dummy values -> will get converted to
# Arrow null column (not holding any memory), so we can afterwards
# fill the resulting table with the correct geometry fields
for col in geometry_columns:
df_attr[col] = None
table = pa.Table.from_pandas(df_attr, preserve_index=index)
geometry_encoding_dict = {}
if geometry_encoding.lower() == "geoarrow":
if Version(pa.__version__) < Version("10.0.0"):
raise ValueError("Converting to 'geoarrow' requires pyarrow >= 10.0.")
# Encode all geometry columns to GeoArrow
for i, col in zip(geometry_indices, geometry_columns):
field, geom_arr = construct_geometry_array(
np.array(df[col].array),
include_z=include_z,
field_name=col,
crs=df[col].crs,
interleaved=interleaved,
)
table = table.set_column(i, field, geom_arr)
geometry_encoding_dict[col] = (
field.metadata[b"ARROW:extension:name"]
.decode()
.removeprefix("geoarrow.")
)
elif geometry_encoding.lower() == "wkb":
# Encode all geometry columns to WKB
for i, col in zip(geometry_indices, geometry_columns):
field, wkb_arr = construct_wkb_array(
np.asarray(df[col].array), field_name=col, crs=df[col].crs
)
table = table.set_column(i, field, wkb_arr)
geometry_encoding_dict[col] = "WKB"
else:
raise ValueError(
f"Expected geometry encoding 'WKB' or 'geoarrow' got {geometry_encoding}"
)
return table, geometry_encoding_dict
def construct_wkb_array(
shapely_arr: NDArray[np.object_],
*,
field_name: str = "geometry",
crs: Optional[str] = None,
) -> Tuple[pa.Field, pa.Array]:
if shapely.geos_version > (3, 10, 0):
kwargs = {"flavor": "iso"}
else:
if shapely.has_z(shapely_arr).any():
raise ValueError("Cannot write 3D geometries with GEOS<3.10")
kwargs = {}
wkb_arr = shapely.to_wkb(shapely_arr, **kwargs)
extension_metadata = {"ARROW:extension:name": "geoarrow.wkb"}
if crs is not None:
extension_metadata["ARROW:extension:metadata"] = json.dumps(
{"crs": crs.to_json()}
)
else:
# In theory this should not be needed, but otherwise pyarrow < 17
# crashes on receiving such data through C Data Interface
# https://github.com/apache/arrow/issues/41741
extension_metadata["ARROW:extension:metadata"] = "{}"
field = pa.field(
field_name, type=pa.binary(), nullable=True, metadata=extension_metadata
)
parr = pa.array(np.asarray(wkb_arr), pa.binary())
return field, parr
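# Example (sketch): for np.array([shapely.Point(0, 0)], dtype=object),
# construct_wkb_array returns a (pa.Field, pa.Array) pair where the field
# carries b"ARROW:extension:name" == b"geoarrow.wkb" in its metadata and the
# array holds the ISO WKB bytes of each geometry.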
def _convert_inner_coords(coords, interleaved, dims, mask=None):
if interleaved:
coords_field = pa.field(dims, pa.float64(), nullable=False)
typ = pa.list_(coords_field, len(dims))
if mask is None:
# mask keyword only added in pyarrow 15.0.0
parr = pa.FixedSizeListArray.from_arrays(coords.ravel(), type=typ)
else:
parr = pa.FixedSizeListArray.from_arrays(
coords.ravel(), type=typ, mask=mask
)
else:
if dims == "xy":
fields = [
pa.field("x", pa.float64(), nullable=False),
pa.field("y", pa.float64(), nullable=False),
]
parr = pa.StructArray.from_arrays(
[coords[:, 0].copy(), coords[:, 1].copy()], fields=fields, mask=mask
)
else:
fields = [
pa.field("x", pa.float64(), nullable=False),
pa.field("y", pa.float64(), nullable=False),
pa.field("z", pa.float64(), nullable=False),
]
parr = pa.StructArray.from_arrays(
[coords[:, 0].copy(), coords[:, 1].copy(), coords[:, 2].copy()],
fields=fields,
mask=mask,
)
return parr
def _linestring_type(point_type):
return pa.list_(pa.field("vertices", point_type, nullable=False))
def _polygon_type(point_type):
return pa.list_(
pa.field(
"rings",
pa.list_(pa.field("vertices", point_type, nullable=False)),
nullable=False,
)
)
def _multipoint_type(point_type):
return pa.list_(pa.field("points", point_type, nullable=False))
def _multilinestring_type(point_type):
return pa.list_(
pa.field("linestrings", _linestring_type(point_type), nullable=False)
)
def _multipolygon_type(point_type):
return pa.list_(pa.field("polygons", _polygon_type(point_type), nullable=False))
def construct_geometry_array(
shapely_arr: NDArray[np.object_],
include_z: Optional[bool] = None,
*,
field_name: str = "geometry",
crs: Optional[str] = None,
interleaved: bool = True,
) -> Tuple[pa.Field, pa.Array]:
# NOTE: this implementation returns a (field, array) pair so that it can set the
# extension metadata on the field without instantiating extension types into the
# global pyarrow registry
geom_type, coords, offsets = shapely.to_ragged_array(
shapely_arr, include_z=include_z
)
mask = shapely.is_missing(shapely_arr)
if mask.any():
if (
geom_type == GeometryType.POINT
and interleaved
and Version(pa.__version__) < Version("15.0.0")
):
raise ValueError(
"Converting point geometries with missing values is not supported "
"for interleaved coordinates with pyarrow < 15.0.0. Please "
"upgrade to a newer version of pyarrow."
)
mask = pa.array(mask, type=pa.bool_())
if geom_type == GeometryType.POINT and not SHAPELY_GE_204:
# bug in shapely < 2.0.4, see https://github.com/shapely/shapely/pull/2034
# this workaround only works if there are no empty points
indices = np.nonzero(mask)[0]
indices = indices - np.arange(len(indices))
coords = np.insert(coords, indices, np.nan, axis=0)
else:
mask = None
if coords.shape[-1] == 2:
dims = "xy"
elif coords.shape[-1] == 3:
dims = "xyz"
else:
raise ValueError(f"Unexpected coords dimensions: {coords.shape}")
extension_metadata: Dict[str, str] = {}
if crs is not None:
extension_metadata["ARROW:extension:metadata"] = json.dumps(
{"crs": crs.to_json()}
)
else:
# In theory this should not be needed, but otherwise pyarrow < 17
# crashes on receiving such data through C Data Interface
# https://github.com/apache/arrow/issues/41741
extension_metadata["ARROW:extension:metadata"] = "{}"
if geom_type == GeometryType.POINT:
parr = _convert_inner_coords(coords, interleaved, dims, mask=mask)
extension_metadata["ARROW:extension:name"] = "geoarrow.point"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.LINESTRING:
assert len(offsets) == 1, "Expected one offsets array"
(geom_offsets,) = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
parr = pa.ListArray.from_arrays(
pa.array(geom_offsets), _parr, _linestring_type(_parr.type), mask=mask
)
extension_metadata["ARROW:extension:name"] = "geoarrow.linestring"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.POLYGON:
assert len(offsets) == 2, "Expected two offsets arrays"
ring_offsets, geom_offsets = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
parr = parr.cast(_polygon_type(_parr.type))
extension_metadata["ARROW:extension:name"] = "geoarrow.polygon"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.MULTIPOINT:
assert len(offsets) == 1, "Expected one offsets array"
(geom_offsets,) = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
parr = pa.ListArray.from_arrays(
pa.array(geom_offsets), _parr, type=_multipoint_type(_parr.type), mask=mask
)
extension_metadata["ARROW:extension:name"] = "geoarrow.multipoint"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.MULTILINESTRING:
assert len(offsets) == 2, "Expected two offsets arrays"
ring_offsets, geom_offsets = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
parr = parr.cast(_multilinestring_type(_parr.type))
extension_metadata["ARROW:extension:name"] = "geoarrow.multilinestring"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.MULTIPOLYGON:
assert len(offsets) == 3, "Expected three offsets arrays"
ring_offsets, polygon_offsets, geom_offsets = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
_parr2 = pa.ListArray.from_arrays(pa.array(polygon_offsets), _parr1)
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr2, mask=mask)
parr = parr.cast(_multipolygon_type(_parr.type))
extension_metadata["ARROW:extension:name"] = "geoarrow.multipolygon"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
else:
raise ValueError(f"Unsupported type for geoarrow: {geom_type}")
## GeoArrow -> GeoPandas
def _get_arrow_geometry_field(field):
if (meta := field.metadata) is not None:
if (ext_name := meta.get(b"ARROW:extension:name", None)) is not None:
if ext_name.startswith(b"geoarrow."):
if (
ext_meta := meta.get(b"ARROW:extension:metadata", None)
) is not None:
ext_meta = json.loads(ext_meta.decode())
return ext_name.decode(), ext_meta
if isinstance(field.type, pa.ExtensionType):
ext_name = field.type.extension_name
if ext_name.startswith("geoarrow."):
ext_meta_ser = field.type.__arrow_ext_serialize__()
if ext_meta_ser:
ext_meta = json.loads(ext_meta_ser.decode())
else:
ext_meta = None
return ext_name, ext_meta
return None
def arrow_to_geopandas(table, geometry=None):
"""
Convert Arrow table object to a GeoDataFrame based on GeoArrow extension types.
Parameters
----------
table : pyarrow.Table
The Arrow table to convert.
geometry : str, default None
The name of the geometry column to set as the active geometry
column. If None, the first geometry column found will be used.
Returns
-------
GeoDataFrame
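Examples
--------
A minimal sketch; ``table`` is assumed to carry GeoArrow extension field
metadata (e.g. as produced by ``geopandas_to_arrow``):
>>> gdf = arrow_to_geopandas(table)  # doctest: +SKIP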
"""
if not isinstance(table, pa.Table):
table = pa.table(table)
geom_fields = []
for i, field in enumerate(table.schema):
geom = _get_arrow_geometry_field(field)
if geom is not None:
geom_fields.append((i, field.name, *geom))
if len(geom_fields) == 0:
raise ValueError("No geometry column found in the Arrow table.")
table_attr = table.drop([f[1] for f in geom_fields])
df = table_attr.to_pandas()
for i, col, ext_name, ext_meta in geom_fields:
crs = None
if ext_meta is not None and "crs" in ext_meta:
crs = ext_meta["crs"]
if ext_name == "geoarrow.wkb":
geom_arr = from_wkb(np.array(table[col]), crs=crs)
elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
geom_arr = from_shapely(
construct_shapely_array(table[col].combine_chunks(), ext_name), crs=crs
)
else:
raise TypeError(f"Unknown GeoArrow extension type: {ext_name}")
df.insert(i, col, geom_arr)
return GeoDataFrame(df, geometry=geometry or geom_fields[0][1])
def arrow_to_geometry_array(arr):
"""
Convert Arrow array object (representing single GeoArrow array) to a
geopandas GeometryArray.
Specifically for GeoSeries.from_arrow.
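Example (sketch; ``arr`` is assumed to be any Arrow array object exposing
``__arrow_c_array__`` with GeoArrow extension metadata):
>>> geom_arr = arrow_to_geometry_array(arr)  # doctest: +SKIP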
"""
if Version(pa.__version__) < Version("14.0.0"):
raise ValueError("Importing from Arrow requires pyarrow >= 14.0.")
schema_capsule, array_capsule = arr.__arrow_c_array__()
field = pa.Field._import_from_c_capsule(schema_capsule)
pa_arr = pa.Array._import_from_c_capsule(field.__arrow_c_schema__(), array_capsule)
geom_info = _get_arrow_geometry_field(field)
if geom_info is None:
raise ValueError("No GeoArrow geometry field found.")
ext_name, ext_meta = geom_info
crs = None
if ext_meta is not None and "crs" in ext_meta:
crs = ext_meta["crs"]
if ext_name == "geoarrow.wkb":
geom_arr = from_wkb(np.array(pa_arr), crs=crs)
elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
geom_arr = from_shapely(construct_shapely_array(pa_arr, ext_name), crs=crs)
else:
raise ValueError(f"Unknown GeoArrow extension type: {ext_name}")
return geom_arr
def _get_inner_coords(arr):
if pa.types.is_struct(arr.type):
if arr.type.num_fields == 2:
coords = np.column_stack(
[np.asarray(arr.field("x")), np.asarray(arr.field("y"))]
)
else:
coords = np.column_stack(
[
np.asarray(arr.field("x")),
np.asarray(arr.field("y")),
np.asarray(arr.field("z")),
]
)
return coords
else:
# fixed size list
return np.asarray(arr.values).reshape(len(arr), -1)
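# Example (sketch): a struct<x: double, y: double> array yields an (n, 2)
# coordinate ndarray, while an interleaved fixed_size_list<double>[2] array
# is reshaped from its flat values buffer to the same (n, 2) shape.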
def construct_shapely_array(arr: pa.Array, extension_name: str):
"""
Construct a NumPy array of shapely geometries from a pyarrow.Array
with GeoArrow extension type.
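Example (sketch; ``arr`` is assumed to be a "geoarrow.point" array):
>>> construct_shapely_array(arr, "geoarrow.point")  # doctest: +SKIP
array([<POINT (0 0)>, <POINT (1 1)>], dtype=object)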
"""
if isinstance(arr, pa.ExtensionArray):
arr = arr.storage
if extension_name == "geoarrow.point":
coords = _get_inner_coords(arr)
result = shapely.from_ragged_array(GeometryType.POINT, coords, None)
elif extension_name == "geoarrow.linestring":
coords = _get_inner_coords(arr.values)
offsets1 = np.asarray(arr.offsets)
offsets = (offsets1,)
result = shapely.from_ragged_array(GeometryType.LINESTRING, coords, offsets)
elif extension_name == "geoarrow.polygon":
coords = _get_inner_coords(arr.values.values)
offsets2 = np.asarray(arr.offsets)
offsets1 = np.asarray(arr.values.offsets)
offsets = (offsets1, offsets2)
result = shapely.from_ragged_array(GeometryType.POLYGON, coords, offsets)
elif extension_name == "geoarrow.multipoint":
coords = _get_inner_coords(arr.values)
offsets1 = np.asarray(arr.offsets)
offsets = (offsets1,)
result = shapely.from_ragged_array(GeometryType.MULTIPOINT, coords, offsets)
elif extension_name == "geoarrow.multilinestring":
coords = _get_inner_coords(arr.values.values)
offsets2 = np.asarray(arr.offsets)
offsets1 = np.asarray(arr.values.offsets)
offsets = (offsets1, offsets2)
result = shapely.from_ragged_array(
GeometryType.MULTILINESTRING, coords, offsets
)
elif extension_name == "geoarrow.multipolygon":
coords = _get_inner_coords(arr.values.values.values)
offsets3 = np.asarray(arr.offsets)
offsets2 = np.asarray(arr.values.offsets)
offsets1 = np.asarray(arr.values.values.offsets)
offsets = (offsets1, offsets2, offsets3)
result = shapely.from_ragged_array(GeometryType.MULTIPOLYGON, coords, offsets)
else:
raise ValueError(extension_name)
# apply validity mask
if arr.null_count:
mask = np.asarray(arr.is_null())
result = np.where(mask, None, result)
return result

View File

@@ -1,72 +0,0 @@
from packaging.version import Version
import pyarrow
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1 and calling `pa.PyExtensionType.set_auto_load(True)`
- installing pyarrow-hotfix (`pip install pyarrow-hotfix`) and disabling it by running
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and registering this type explicitly.
See https://arrow.apache.org/docs/dev/python/extending_types.html#defining-extension-types-user-defined-types
for more details.
"""
def patch_pyarrow():
# starting from pyarrow 14.0.1, it has its own mechanism
if Version(pyarrow.__version__) >= Version("14.0.1"):
return
# if the user has pyarrow_hotfix (https://github.com/pitrou/pyarrow-hotfix)
# installed, use this instead (which also ensures it works if they had
# called `pyarrow_hotfix.uninstall()`)
try:
import pyarrow_hotfix # noqa: F401
except ImportError:
pass
else:
return
# if the hotfix is already installed and enabled
if getattr(pyarrow, "_hotfix_installed", False):
return
class ForbiddenExtensionType(pyarrow.ExtensionType):
def __arrow_ext_serialize__(self):
return b""
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
import io
import pickletools
out = io.StringIO()
pickletools.dis(serialized, out)
raise RuntimeError(
_ERROR_MSG.format(
storage_type=storage_type,
serialized=serialized,
pickle_disassembly=out.getvalue(),
)
)
pyarrow.unregister_extension_type("arrow.py_extension_type")
pyarrow.register_extension_type(
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
)
pyarrow._hotfix_installed = True
patch_pyarrow()
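# Effect (sketch): on pyarrow < 14.0.1 without pyarrow-hotfix installed, any
# attempt to deserialize an "arrow.py_extension_type" column now raises a
# RuntimeError carrying the disassembled pickle payload instead of executing it.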

View File

@@ -1,913 +0,0 @@
import json
import warnings
from packaging.version import Version
import numpy as np
from pandas import DataFrame, Series
import shapely
import geopandas
from geopandas import GeoDataFrame
from geopandas._compat import import_optional_dependency
from geopandas.array import from_shapely, from_wkb
from .file import _expand_user
METADATA_VERSION = "1.0.0"
SUPPORTED_VERSIONS = ["0.1.0", "0.4.0", "1.0.0-beta.1", "1.0.0", "1.1.0"]
GEOARROW_ENCODINGS = [
"point",
"linestring",
"polygon",
"multipoint",
"multilinestring",
"multipolygon",
]
SUPPORTED_ENCODINGS = ["WKB"] + GEOARROW_ENCODINGS
# reference: https://github.com/opengeospatial/geoparquet
# Metadata structure:
# {
# "geo": {
# "columns": {
# "<name>": {
# "encoding": "WKB"
# "geometry_types": <list of str: REQUIRED>
# "crs": "<PROJJSON or None: OPTIONAL>",
# "orientation": "<'counterclockwise' or None: OPTIONAL>"
# "edges": "planar"
# "bbox": <list of [xmin, ymin, xmax, ymax]: OPTIONAL>
# "epoch": <float: OPTIONAL>
# }
# },
# "primary_column": "<str: REQUIRED>",
# "version": "<METADATA_VERSION>",
#
# # Additional GeoPandas specific metadata (not in metadata spec)
# "creator": {
# "library": "geopandas",
# "version": "<geopandas.__version__>"
# }
# }
# }
def _is_fsspec_url(url):
return (
isinstance(url, str)
and "://" in url
and not url.startswith(("http://", "https://"))
)
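# Example (sketch): "s3://bucket/data.parquet" -> True,
# "https://host/data.parquet" -> False (http(s) is handled elsewhere),
# "data.parquet" -> False.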
def _remove_id_from_member_of_ensembles(json_dict):
"""
Older PROJ versions will not recognize IDs of datum ensemble members that
were added in more recent PROJ database versions.
Cf https://github.com/opengeospatial/geoparquet/discussions/110
and https://github.com/OSGeo/PROJ/pull/3221
Mimicking the patch to GDAL from https://github.com/OSGeo/gdal/pull/5872
"""
for key, value in json_dict.items():
if isinstance(value, dict):
_remove_id_from_member_of_ensembles(value)
elif key == "members" and isinstance(value, list):
for member in value:
member.pop("id", None)
# type ids 0 to 7
_geometry_type_names = [
"Point",
"LineString",
"LineString",
"Polygon",
"MultiPoint",
"MultiLineString",
"MultiPolygon",
"GeometryCollection",
]
_geometry_type_names += [geom_type + " Z" for geom_type in _geometry_type_names]
def _get_geometry_types(series):
"""
Get unique geometry types from a GeoSeries.
"""
arr_geometry_types = shapely.get_type_id(series.array._data)
# ensure to include "... Z" for 3D geometries
has_z = shapely.has_z(series.array._data)
arr_geometry_types[has_z] += 8
geometry_types = Series(arr_geometry_types).unique().tolist()
# drop missing values (shapely.get_type_id returns -1 for those)
if -1 in geometry_types:
geometry_types.remove(-1)
return sorted([_geometry_type_names[idx] for idx in geometry_types])
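# Example (sketch): a series mixing 2D and 3D points yields
# ["Point", "Point Z"] (the +8 offset above selects the "... Z" names).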
def _create_metadata(
df, schema_version=None, geometry_encoding=None, write_covering_bbox=False
):
"""Create and encode geo metadata dict.
Parameters
----------
df : GeoDataFrame
schema_version : {'0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', None}
GeoParquet specification version; if not provided will default to
latest supported version.
write_covering_bbox : bool, default False
Writes the bounding box column for each row entry with column
name 'bbox'. Writing a bbox column can be computationally
expensive, hence its default setting is False.
Returns
-------
dict
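A sketch of the resulting structure (exact values depend on the input):
>>> _create_metadata(gdf, geometry_encoding={"geometry": "WKB"})  # doctest: +SKIP
{'primary_column': 'geometry', 'columns': {'geometry': {...}}, 'version': '1.0.0', 'creator': {...}}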
"""
if schema_version is None:
if geometry_encoding and any(
encoding != "WKB" for encoding in geometry_encoding.values()
):
schema_version = "1.1.0"
else:
schema_version = METADATA_VERSION
if schema_version not in SUPPORTED_VERSIONS:
raise ValueError(
f"schema_version must be one of: {', '.join(SUPPORTED_VERSIONS)}"
)
# Construct metadata for each geometry
column_metadata = {}
for col in df.columns[df.dtypes == "geometry"]:
series = df[col]
geometry_types = _get_geometry_types(series)
if schema_version[0] == "0":
geometry_types_name = "geometry_type"
if len(geometry_types) == 1:
geometry_types = geometry_types[0]
else:
geometry_types_name = "geometry_types"
crs = None
if series.crs:
if schema_version == "0.1.0":
crs = series.crs.to_wkt()
else: # version >= 0.4.0
crs = series.crs.to_json_dict()
_remove_id_from_member_of_ensembles(crs)
column_metadata[col] = {
"encoding": geometry_encoding[col],
"crs": crs,
geometry_types_name: geometry_types,
}
bbox = series.total_bounds.tolist()
if np.isfinite(bbox).all():
# don't add bbox with NaNs for empty / all-NA geometry column
column_metadata[col]["bbox"] = bbox
if write_covering_bbox:
column_metadata[col]["covering"] = {
"bbox": {
"xmin": ["bbox", "xmin"],
"ymin": ["bbox", "ymin"],
"xmax": ["bbox", "xmax"],
"ymax": ["bbox", "ymax"],
},
}
return {
"primary_column": df._geometry_column_name,
"columns": column_metadata,
"version": schema_version,
"creator": {"library": "geopandas", "version": geopandas.__version__},
}
def _encode_metadata(metadata):
"""Encode metadata dict to UTF-8 JSON string
Parameters
----------
metadata : dict
Returns
-------
UTF-8 encoded JSON string
"""
return json.dumps(metadata).encode("utf-8")
def _decode_metadata(metadata_str):
"""Decode a UTF-8 encoded JSON string to dict
Parameters
----------
metadata_str : string (UTF-8 encoded)
Returns
-------
dict
"""
if metadata_str is None:
return None
return json.loads(metadata_str.decode("utf-8"))
def _validate_dataframe(df):
"""Validate that the GeoDataFrame conforms to requirements for writing
to Parquet format.
Raises `ValueError` if the GeoDataFrame is not valid.
copied from `pandas.io.parquet`
Parameters
----------
df : GeoDataFrame
"""
if not isinstance(df, DataFrame):
raise ValueError("Writing to Parquet/Feather only supports IO with DataFrames")
# must have value column names (strings only)
if df.columns.inferred_type not in {"string", "unicode", "empty"}:
raise ValueError("Writing to Parquet/Feather requires string column names")
# index level names must be strings
valid_names = all(
isinstance(name, str) for name in df.index.names if name is not None
)
if not valid_names:
raise ValueError("Index level names must be strings")
def _validate_geo_metadata(metadata):
"""Validate geo metadata.
Must not be empty, and must contain the structure specified above.
Raises ValueError if metadata is not valid.
Parameters
----------
metadata : dict
"""
if not metadata:
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
# version was schema_version in 0.1.0
version = metadata.get("version", metadata.get("schema_version"))
if not version:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key: "
"'version'"
)
required_keys = ("primary_column", "columns")
for key in required_keys:
if metadata.get(key, None) is None:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key: "
"'{key}'".format(key=key)
)
if not isinstance(metadata["columns"], dict):
raise ValueError("'columns' in 'geo' metadata must be a dict")
# Validate that geometry columns have required metadata and values
# leaving out "geometry_type" for compatibility with 0.1
required_col_keys = ("encoding",)
for col, column_metadata in metadata["columns"].items():
for key in required_col_keys:
if key not in column_metadata:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key "
"'{key}' for column '{col}'".format(key=key, col=col)
)
if column_metadata["encoding"] not in SUPPORTED_ENCODINGS:
raise ValueError(
"Only WKB geometry encoding or one of the native encodings "
f"({GEOARROW_ENCODINGS!r}) are supported, "
f"got: {column_metadata['encoding']}"
)
if column_metadata.get("edges", "planar") == "spherical":
warnings.warn(
f"The geo metadata indicate that column '{col}' has spherical edges, "
"but because GeoPandas currently does not support spherical "
"geometry, it ignores this metadata and will interpret the edges of "
"the geometries as planar.",
UserWarning,
stacklevel=4,
)
if "covering" in column_metadata:
covering = column_metadata["covering"]
if "bbox" in covering:
bbox = covering["bbox"]
for var in ["xmin", "ymin", "xmax", "ymax"]:
if var not in bbox.keys():
raise ValueError("Metadata for bbox column is malformed.")
def _geopandas_to_arrow(
df,
index=None,
geometry_encoding="WKB",
schema_version=None,
write_covering_bbox=None,
):
"""
Helper function with main, shared logic for to_parquet/to_feather.
"""
from pyarrow import StructArray
from geopandas.io._geoarrow import geopandas_to_arrow
_validate_dataframe(df)
if schema_version is not None:
if geometry_encoding != "WKB" and schema_version != "1.1.0":
raise ValueError(
"'geoarrow' encoding is only supported with schema version >= 1.1.0"
)
table, geometry_encoding_dict = geopandas_to_arrow(
df, geometry_encoding=geometry_encoding, index=index, interleaved=False
)
geo_metadata = _create_metadata(
df,
schema_version=schema_version,
geometry_encoding=geometry_encoding_dict,
write_covering_bbox=write_covering_bbox,
)
if write_covering_bbox:
if "bbox" in df.columns:
raise ValueError(
"An existing column 'bbox' already exists in the dataframe. "
"Please rename to write covering bbox."
)
bounds = df.bounds
bbox_array = StructArray.from_arrays(
[bounds["minx"], bounds["miny"], bounds["maxx"], bounds["maxy"]],
names=["xmin", "ymin", "xmax", "ymax"],
)
table = table.append_column("bbox", bbox_array)
# Store geopandas specific file-level metadata
# This must be done AFTER creating the table or it is not persisted
metadata = table.schema.metadata
metadata.update({b"geo": _encode_metadata(geo_metadata)})
return table.replace_schema_metadata(metadata)
def _to_parquet(
df,
path,
index=None,
compression="snappy",
geometry_encoding="WKB",
schema_version=None,
write_covering_bbox=False,
**kwargs,
):
"""
Write a GeoDataFrame to the Parquet format.
Any geometry columns present are serialized to WKB format in the file.
Requires 'pyarrow'.
This is tracking version 1.0.0 of the GeoParquet specification at:
https://github.com/opengeospatial/geoparquet. Writing older versions is
supported using the `schema_version` keyword.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output except `RangeIndex` which is stored as metadata only.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
The encoding to use for the geometry columns. Defaults to "WKB"
for maximum interoperability. Specify "geoarrow" to use one of the
native GeoArrow-based single-geometry type encodings.
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
GeoParquet specification version; if not provided will default to
latest supported version.
write_covering_bbox : bool, default False
Writes the bounding box column for each row entry with column
name 'bbox'. Writing a bbox column can be computationally
expensive, hence its default setting is False.
**kwargs
Additional keyword arguments passed to pyarrow.parquet.write_table().
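Examples
--------
Typically invoked through the public wrapper method:
>>> gdf.to_parquet("data.parquet")  # doctest: +SKIP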
"""
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
path = _expand_user(path)
table = _geopandas_to_arrow(
df,
index=index,
geometry_encoding=geometry_encoding,
schema_version=schema_version,
write_covering_bbox=write_covering_bbox,
)
parquet.write_table(table, path, compression=compression, **kwargs)
def _to_feather(df, path, index=None, compression=None, schema_version=None, **kwargs):
"""
Write a GeoDataFrame to the Feather format.
Any geometry columns present are serialized to WKB format in the file.
Requires 'pyarrow' >= 0.17.
This is tracking version 1.0.0 of the GeoParquet specification for
the metadata at: https://github.com/opengeospatial/geoparquet. Writing
older versions is supported using the `schema_version` keyword.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output except `RangeIndex` which is stored as metadata only.
compression : {'zstd', 'lz4', 'uncompressed'}, optional
Name of the compression to use. Use ``"uncompressed"`` for no
compression. By default uses LZ4 if available, otherwise uncompressed.
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
GeoParquet specification version for the metadata; if not provided
will default to latest supported version.
kwargs
Additional keyword arguments passed to pyarrow.feather.write_feather().
"""
feather = import_optional_dependency(
"pyarrow.feather", extra="pyarrow is required for Feather support."
)
# TODO move this into `import_optional_dependency`
import pyarrow
if Version(pyarrow.__version__) < Version("0.17.0"):
raise ImportError("pyarrow >= 0.17 required for Feather support")
path = _expand_user(path)
table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
feather.write_feather(table, path, compression=compression, **kwargs)
def _arrow_to_geopandas(table, geo_metadata=None):
"""
Helper function with main, shared logic for read_parquet/read_feather.
"""
if geo_metadata is None:
# Note: this path of not passing metadata is also used by dask-geopandas
geo_metadata = _validate_and_decode_metadata(table.schema.metadata)
# Find all geometry columns that were read from the file. May
# be a subset if 'columns' parameter is used.
geometry_columns = [
col for col in geo_metadata["columns"] if col in table.column_names
]
result_column_names = list(table.slice(0, 0).to_pandas().columns)
geometry_columns.sort(key=result_column_names.index)
if not len(geometry_columns):
raise ValueError(
"""No geometry columns are included in the columns read from
the Parquet/Feather file. To read this file without geometry columns,
use pandas.read_parquet/read_feather() instead."""
)
geometry = geo_metadata["primary_column"]
# Missing geometry likely indicates a subset of columns was read;
# promote the first available geometry to the primary geometry.
if len(geometry_columns) and geometry not in geometry_columns:
geometry = geometry_columns[0]
# if there are multiple non-primary geometry columns, raise a warning
if len(geometry_columns) > 1:
warnings.warn(
"Multiple non-primary geometry columns read from Parquet/Feather "
"file. The first column read was promoted to the primary geometry.",
stacklevel=3,
)
table_attr = table.drop(geometry_columns)
df = table_attr.to_pandas()
# Convert the WKB columns that are present back to geometry.
for col in geometry_columns:
col_metadata = geo_metadata["columns"][col]
if "crs" in col_metadata:
crs = col_metadata["crs"]
if isinstance(crs, dict):
_remove_id_from_member_of_ensembles(crs)
else:
# per the GeoParquet spec, missing CRS is to be interpreted as
# OGC:CRS84
crs = "OGC:CRS84"
if col_metadata["encoding"] == "WKB":
geom_arr = from_wkb(np.array(table[col]), crs=crs)
else:
from geopandas.io._geoarrow import construct_shapely_array
geom_arr = from_shapely(
construct_shapely_array(
table[col].combine_chunks(), "geoarrow." + col_metadata["encoding"]
),
crs=crs,
)
df.insert(result_column_names.index(col), col, geom_arr)
return GeoDataFrame(df, geometry=geometry)
def _get_filesystem_path(path, filesystem=None, storage_options=None):
"""
Get the filesystem and path for a given filesystem and path.
If the filesystem is not None then it's just returned as is.
"""
import pyarrow
if (
isinstance(path, str)
and storage_options is None
and filesystem is None
and Version(pyarrow.__version__) >= Version("5.0.0")
):
# Use the native pyarrow filesystem if possible.
try:
from pyarrow.fs import FileSystem
filesystem, path = FileSystem.from_uri(path)
except Exception:
# fallback to use get_handle / fsspec for filesystems
# that pyarrow doesn't support
pass
if _is_fsspec_url(path) and filesystem is None:
fsspec = import_optional_dependency(
"fsspec", extra="fsspec is requred for 'storage_options'."
)
filesystem, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
if filesystem is None and storage_options:
raise ValueError(
"Cannot provide 'storage_options' with non-fsspec path '{}'".format(path)
)
return filesystem, path
def _ensure_arrow_fs(filesystem):
"""
Simplified version of pyarrow.fs._ensure_filesystem. This is only needed
below because `pyarrow.parquet.read_metadata` does not yet accept a
filesystem keyword (https://issues.apache.org/jira/browse/ARROW-16719)
"""
from pyarrow import fs
if isinstance(filesystem, fs.FileSystem):
return filesystem
# handle fsspec-compatible filesystems
try:
import fsspec
except ImportError:
pass
else:
if isinstance(filesystem, fsspec.AbstractFileSystem):
return fs.PyFileSystem(fs.FSSpecHandler(filesystem))
return filesystem
def _validate_and_decode_metadata(metadata):
if metadata is None or b"geo" not in metadata:
raise ValueError(
"""Missing geo metadata in Parquet/Feather file.
Use pandas.read_parquet/read_feather() instead."""
)
# check for malformed metadata
try:
decoded_geo_metadata = _decode_metadata(metadata.get(b"geo", b""))
except (TypeError, json.decoder.JSONDecodeError):
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
_validate_geo_metadata(decoded_geo_metadata)
return decoded_geo_metadata
def _read_parquet_schema_and_metadata(path, filesystem):
"""
Opening the Parquet file/dataset a first time to get the schema and metadata.
TODO: we should look into how we can reuse opened dataset for reading the
actual data, to avoid discovering the dataset twice (problem right now is
that the ParquetDataset interface doesn't allow passing the filters on read)
"""
import pyarrow
from pyarrow import parquet
kwargs = {}
if Version(pyarrow.__version__) < Version("15.0.0"):
kwargs = dict(use_legacy_dataset=False)
try:
schema = parquet.ParquetDataset(path, filesystem=filesystem, **kwargs).schema
except Exception:
schema = parquet.read_schema(path, filesystem=filesystem)
metadata = schema.metadata
# read metadata separately to get the raw Parquet FileMetaData metadata
# (pyarrow doesn't properly expose those in schema.metadata for files
# created by GDAL - https://issues.apache.org/jira/browse/ARROW-16688)
if metadata is None or b"geo" not in metadata:
try:
metadata = parquet.read_metadata(path, filesystem=filesystem).metadata
except Exception:
pass
return schema, metadata
def _read_parquet(path, columns=None, storage_options=None, bbox=None, **kwargs):
"""
Load a Parquet object from the file path, returning a GeoDataFrame.
You can read a subset of columns in the file using the ``columns`` parameter.
However, the structure of the returned GeoDataFrame will depend on which
columns you read:
* if no geometry columns are read, this will raise a ``ValueError`` - you
should use the pandas `read_parquet` method instead.
* if the primary geometry column saved to this file is not included in
columns, the first available geometry column will be set as the geometry
column of the returned GeoDataFrame.
Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
specification at: https://github.com/opengeospatial/geoparquet
If 'crs' key is not present in the GeoParquet metadata associated with the
Parquet object, it will default to "OGC:CRS84" according to the specification.
Requires 'pyarrow'.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
columns : list-like of strings, default=None
If not None, only these columns will be read from the file. If
the primary geometry column is not included, the first secondary
geometry read from the file will be set as the geometry column
of the returned GeoDataFrame. If no geometry columns are present,
a ``ValueError`` will be raised.
storage_options : dict, optional
Extra options that make sense for a particular storage connection, e.g. host,
port, username, password, etc. For HTTP(S) URLs the key-value pairs are
forwarded to urllib as header options. For other URLs (e.g. starting with
"s3://", and "gcs://") the key-value pairs are forwarded to fsspec. Please
see fsspec and urllib for more details.
When no storage options are provided and a filesystem is implemented by
both ``pyarrow.fs`` and ``fsspec`` (e.g. "s3://") then the ``pyarrow.fs``
filesystem is preferred. Provide the instantiated fsspec filesystem using
the ``filesystem`` keyword if you wish to use its implementation.
bbox : tuple, optional
Bounding box to be used to filter selection from geoparquet data. This
is only usable if the data was saved with the bbox covering metadata.
Input is of the tuple format (xmin, ymin, xmax, ymax).
**kwargs
Any additional kwargs passed to :func:`pyarrow.parquet.read_table`.
Returns
-------
GeoDataFrame
Examples
--------
>>> df = geopandas.read_parquet("data.parquet") # doctest: +SKIP
Specifying columns to read:
>>> df = geopandas.read_parquet(
... "data.parquet",
... columns=["geometry", "pop_est"]
... ) # doctest: +SKIP
"""
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
import geopandas.io._pyarrow_hotfix # noqa: F401
# TODO(https://github.com/pandas-dev/pandas/pull/41194): see if pandas
# adds filesystem as a keyword and match that.
filesystem = kwargs.pop("filesystem", None)
filesystem, path = _get_filesystem_path(
path, filesystem=filesystem, storage_options=storage_options
)
path = _expand_user(path)
schema, metadata = _read_parquet_schema_and_metadata(path, filesystem)
geo_metadata = _validate_and_decode_metadata(metadata)
bbox_filter = (
_get_parquet_bbox_filter(geo_metadata, bbox) if bbox is not None else None
)
if_bbox_column_exists = _check_if_covering_in_geo_metadata(geo_metadata)
# by default, bbox column is not read in, so must specify which
# columns are read in if it exists.
if not columns and if_bbox_column_exists:
columns = _get_non_bbox_columns(schema, geo_metadata)
# if both bbox and filters kwargs are used, must splice together.
if "filters" in kwargs:
filters_kwarg = kwargs.pop("filters")
filters = _splice_bbox_and_filters(filters_kwarg, bbox_filter)
else:
filters = bbox_filter
kwargs["use_pandas_metadata"] = True
table = parquet.read_table(
path, columns=columns, filesystem=filesystem, filters=filters, **kwargs
)
return _arrow_to_geopandas(table, geo_metadata)
def _read_feather(path, columns=None, **kwargs):
"""
Load a Feather object from the file path, returning a GeoDataFrame.
You can read a subset of columns in the file using the ``columns`` parameter.
However, the structure of the returned GeoDataFrame will depend on which
columns you read:
* if no geometry columns are read, this will raise a ``ValueError`` - you
should use the pandas `read_feather` method instead.
* if the primary geometry column saved to this file is not included in
columns, the first available geometry column will be set as the geometry
column of the returned GeoDataFrame.
Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
specification at: https://github.com/opengeospatial/geoparquet
If 'crs' key is not present in the Feather metadata associated with the
Parquet object, it will default to "OGC:CRS84" according to the specification.
Requires 'pyarrow' >= 0.17.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
columns : list-like of strings, default=None
If not None, only these columns will be read from the file. If
the primary geometry column is not included, the first secondary
geometry read from the file will be set as the geometry column
of the returned GeoDataFrame. If no geometry columns are present,
a ``ValueError`` will be raised.
**kwargs
Any additional kwargs passed to pyarrow.feather.read_table().
Returns
-------
GeoDataFrame
Examples
--------
>>> df = geopandas.read_feather("data.feather") # doctest: +SKIP
Specifying columns to read:
>>> df = geopandas.read_feather(
... "data.feather",
... columns=["geometry", "pop_est"]
... ) # doctest: +SKIP
"""
feather = import_optional_dependency(
"pyarrow.feather", extra="pyarrow is required for Feather support."
)
# TODO move this into `import_optional_dependency`
import pyarrow
import geopandas.io._pyarrow_hotfix # noqa: F401
if Version(pyarrow.__version__) < Version("0.17.0"):
raise ImportError("pyarrow >= 0.17 required for Feather support")
path = _expand_user(path)
table = feather.read_table(path, columns=columns, **kwargs)
return _arrow_to_geopandas(table)
def _get_parquet_bbox_filter(geo_metadata, bbox):
primary_column = geo_metadata["primary_column"]
if _check_if_covering_in_geo_metadata(geo_metadata):
bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
return _convert_bbox_to_parquet_filter(bbox, bbox_column_name)
elif geo_metadata["columns"][primary_column]["encoding"] == "point":
import pyarrow.compute as pc
return (
(pc.field((primary_column, "x")) >= bbox[0])
& (pc.field((primary_column, "x")) <= bbox[2])
& (pc.field((primary_column, "y")) >= bbox[1])
& (pc.field((primary_column, "y")) <= bbox[3])
)
else:
raise ValueError(
"Specifying 'bbox' not supported for this Parquet file (it should either "
"have a bbox covering column or use 'point' encoding)."
)
def _convert_bbox_to_parquet_filter(bbox, bbox_column_name):
import pyarrow.compute as pc
return ~(
(pc.field((bbox_column_name, "xmin")) > bbox[2])
| (pc.field((bbox_column_name, "ymin")) > bbox[3])
| (pc.field((bbox_column_name, "xmax")) < bbox[0])
| (pc.field((bbox_column_name, "ymax")) < bbox[1])
)
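# The negation above is the usual "not disjoint" test: a row is kept unless
# its bbox lies entirely outside the query bbox, e.g. for bbox=(0, 0, 10, 10)
# any row whose bbox-column xmin is greater than 10 is filtered out.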
def _check_if_covering_in_geo_metadata(geo_metadata):
primary_column = geo_metadata["primary_column"]
return "covering" in geo_metadata["columns"][primary_column].keys()
def _get_bbox_encoding_column_name(geo_metadata):
primary_column = geo_metadata["primary_column"]
return geo_metadata["columns"][primary_column]["covering"]["bbox"]["xmin"][0]
def _get_non_bbox_columns(schema, geo_metadata):
bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
columns = schema.names
if bbox_column_name in columns:
columns.remove(bbox_column_name)
return columns
def _splice_bbox_and_filters(kwarg_filters, bbox_filter):
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
if bbox_filter is None:
return kwarg_filters
filters_expression = parquet.filters_to_expression(kwarg_filters)
return bbox_filter & filters_expression

View File

@@ -1,851 +0,0 @@
from __future__ import annotations
import os
import urllib.request
import warnings
from io import IOBase
from packaging.version import Version
from pathlib import Path
# Adapted from pandas.io.common
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc, uses_params, uses_relative
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
import shapely
from shapely.geometry import mapping
from shapely.geometry.base import BaseGeometry
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20
from geopandas.io.util import vsi_path
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
# file:// URIs are supported by fiona/pyogrio -> no need to open and read the file here ourselves
_VALID_URLS.discard("file")
fiona = None
fiona_env = None
fiona_import_error = None
FIONA_GE_19 = False
def _import_fiona():
global fiona
global fiona_env
global fiona_import_error
global FIONA_GE_19
if fiona is None:
try:
import fiona
# only try to import fiona.Env if the main fiona import succeeded
# (otherwise you can get confusing "AttributeError: module 'fiona'
# has no attribute '_loading'" / partially initialized module errors)
try:
from fiona import Env as fiona_env
except ImportError:
try:
from fiona import drivers as fiona_env
except ImportError:
fiona_env = None
FIONA_GE_19 = Version(Version(fiona.__version__).base_version) >= Version(
"1.9.0"
)
except ImportError as err:
fiona = False
fiona_import_error = str(err)
pyogrio = None
pyogrio_import_error = None
def _import_pyogrio():
global pyogrio
global pyogrio_import_error
if pyogrio is None:
try:
import pyogrio
except ImportError as err:
pyogrio = False
pyogrio_import_error = str(err)
def _check_fiona(func):
if not fiona:
raise ImportError(
f"the {func} requires the 'fiona' package, but it is not installed or does "
f"not import correctly.\nImporting fiona resulted in: {fiona_import_error}"
)
def _check_pyogrio(func):
if not pyogrio:
raise ImportError(
f"the {func} requires the 'pyogrio' package, but it is not installed "
"or does not import correctly."
"\nImporting pyogrio resulted in: {pyogrio_import_error}"
)
def _check_metadata_supported(metadata: str | None, engine: str, driver: str) -> None:
if metadata is None:
return
if driver != "GPKG":
raise NotImplementedError(
"The 'metadata' keyword is only supported for the GPKG driver."
)
if engine == "fiona" and not FIONA_GE_19:
raise NotImplementedError(
"The 'metadata' keyword is only supported for Fiona >= 1.9."
)
def _check_engine(engine, func):
# if not specified through keyword or option, then default to "pyogrio" if
# installed, otherwise try fiona
if engine is None:
import geopandas
engine = geopandas.options.io_engine
if engine is None:
_import_pyogrio()
if pyogrio:
engine = "pyogrio"
else:
_import_fiona()
if fiona:
engine = "fiona"
if engine == "pyogrio":
_import_pyogrio()
_check_pyogrio(func)
elif engine == "fiona":
_import_fiona()
_check_fiona(func)
elif engine is None:
raise ImportError(
f"The {func} requires the 'pyogrio' or 'fiona' package, "
"but neither is installed or imports correctly."
f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
f"\nImporting fiona resulted in: {fiona_import_error}"
)
return engine
_EXTENSION_TO_DRIVER = {
".bna": "BNA",
".dxf": "DXF",
".csv": "CSV",
".shp": "ESRI Shapefile",
".dbf": "ESRI Shapefile",
".json": "GeoJSON",
".geojson": "GeoJSON",
".geojsonl": "GeoJSONSeq",
".geojsons": "GeoJSONSeq",
".gpkg": "GPKG",
".gml": "GML",
".xml": "GML",
".gpx": "GPX",
".gtm": "GPSTrackMaker",
".gtz": "GPSTrackMaker",
".tab": "MapInfo File",
".mif": "MapInfo File",
".mid": "MapInfo File",
".dgn": "DGN",
".fgb": "FlatGeobuf",
}
def _expand_user(path):
"""Expand paths that use ~."""
if isinstance(path, str):
path = os.path.expanduser(path)
elif isinstance(path, Path):
path = path.expanduser()
return path
def _is_url(url):
"""Check to see if *url* has a valid protocol."""
try:
return parse_url(url).scheme in _VALID_URLS
except Exception:
return False
def _read_file(
filename, bbox=None, mask=None, columns=None, rows=None, engine=None, **kwargs
):
"""
Returns a GeoDataFrame from a file or URL.
Parameters
----------
filename : str, path object or file-like object
Either the absolute or relative path to the file or URL to
be opened, or any object with a read() method (such as an open file
or StringIO)
bbox : tuple | GeoDataFrame or GeoSeries | shapely Geometry, default None
Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely
geometry. With engine="fiona", CRS mis-matches are resolved if given a GeoSeries
or GeoDataFrame. With engine="pyogrio", bbox must be in the same CRS as the
dataset. Tuple is (minx, miny, maxx, maxy) to match the bounds property of
shapely geometry objects. Cannot be used with mask.
mask : dict | GeoDataFrame or GeoSeries | shapely Geometry, default None
Filter for features that intersect with the given dict-like geojson
geometry, GeoSeries, GeoDataFrame or shapely geometry.
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
Cannot be used with bbox. If multiple geometries are passed, this will
first union all geometries, which may be computationally expensive.
columns : list, optional
List of column names to import from the data source. Column names
must exactly match the names in the data source. To avoid reading
any columns (besides the geometry column), pass an empty list-like.
By default reads all columns.
rows : int or slice, default None
Load in specific rows by passing an integer (first `n` rows) or a
slice() object.
engine : str, "pyogrio" or "fiona"
The underlying library that is used to read the file. Currently, the
supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
installed, otherwise tries "fiona". Engine can also be set globally
with the ``geopandas.options.io_engine`` option.
**kwargs :
Keyword args to be passed to the engine, and can be used to read
multi-layer data, data stored within archives (zip files), etc.
In case of the "pyogrio" engine, the keyword arguments are passed to
`pyogrio.read_dataframe`. In case of the "fiona" engine, the keyword
arguments are passed to `fiona.open`. For more information on possible
keywords, type: ``import pyogrio; help(pyogrio.read_dataframe)``.
Examples
--------
>>> df = geopandas.read_file("nybb.shp") # doctest: +SKIP
Specifying layer of GPKG:
>>> df = geopandas.read_file("file.gpkg", layer='cities') # doctest: +SKIP
Reading only first 10 rows:
>>> df = geopandas.read_file("nybb.shp", rows=10) # doctest: +SKIP
Reading only geometries intersecting ``mask``:
>>> df = geopandas.read_file("nybb.shp", mask=polygon) # doctest: +SKIP
Reading only geometries intersecting ``bbox``:
>>> df = geopandas.read_file("nybb.shp", bbox=(0, 0, 10, 20)) # doctest: +SKIP
Returns
-------
:obj:`geopandas.GeoDataFrame` or :obj:`pandas.DataFrame` :
If `ignore_geometry=True` a :obj:`pandas.DataFrame` will be returned.
Notes
-----
The format drivers will attempt to detect the encoding of your data, but
may fail. In this case, the proper encoding can be specified explicitly
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
When specifying a URL, geopandas will check if the server supports reading
partial data and in that case pass the URL as is to the underlying engine,
which will then use the network file system handler of GDAL to read from
the URL. Otherwise geopandas will download the data from the URL and pass
all data in-memory to the underlying engine.
If you need more control over how the URL is read, you can specify the
GDAL virtual filesystem manually (e.g. ``/vsicurl/https://...``). See the
GDAL documentation on filesystems for more details
(https://gdal.org/user/virtual_file_systems.html#vsicurl-http-https-ftp-files-random-access).
"""
engine = _check_engine(engine, "'read_file' function")
filename = _expand_user(filename)
from_bytes = False
if _is_url(filename):
# if it is a url that supports random access -> pass through to
# pyogrio/fiona as is (to support downloading only part of the file)
# otherwise still download manually because pyogrio/fiona don't support
# all types of urls (https://github.com/geopandas/geopandas/issues/2908)
with urllib.request.urlopen(filename) as response:
if response.headers.get("Accept-Ranges") != "bytes":
filename = response.read()
from_bytes = True
if engine == "pyogrio":
return _read_file_pyogrio(
filename, bbox=bbox, mask=mask, columns=columns, rows=rows, **kwargs
)
elif engine == "fiona":
if pd.api.types.is_file_like(filename):
data = filename.read()
path_or_bytes = data.encode("utf-8") if isinstance(data, str) else data
from_bytes = True
else:
path_or_bytes = filename
return _read_file_fiona(
path_or_bytes,
from_bytes,
bbox=bbox,
mask=mask,
columns=columns,
rows=rows,
**kwargs,
)
else:
raise ValueError(f"unknown engine '{engine}'")
def _read_file_fiona(
path_or_bytes,
from_bytes,
bbox=None,
mask=None,
columns=None,
rows=None,
where=None,
**kwargs,
):
if where is not None and not FIONA_GE_19:
raise NotImplementedError("where requires fiona 1.9+")
if columns is not None:
if "include_fields" in kwargs:
raise ValueError(
"Cannot specify both 'include_fields' and 'columns' keywords"
)
if not FIONA_GE_19:
raise NotImplementedError("'columns' keyword requires fiona 1.9+")
kwargs["include_fields"] = columns
elif "include_fields" in kwargs:
# alias to columns, as this variable is used below to specify column order
# in the dataframe creation
columns = kwargs["include_fields"]
if not from_bytes:
# Opening a file via URL or file-like-object above automatically detects a
# zipped file. In order to match that behavior, attempt to add a zip scheme
# if missing.
path_or_bytes = vsi_path(str(path_or_bytes))
if from_bytes:
reader = fiona.BytesCollection
else:
reader = fiona.open
with fiona_env():
with reader(path_or_bytes, **kwargs) as features:
crs = features.crs_wkt
# attempt to get EPSG code
try:
# fiona 1.9+
epsg = features.crs.to_epsg(confidence_threshold=100)
if epsg is not None:
crs = epsg
except AttributeError:
# fiona <= 1.8
try:
crs = features.crs["init"]
except (TypeError, KeyError):
pass
# handle loading the bounding box
if bbox is not None:
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
bbox = tuple(bbox.to_crs(crs).total_bounds)
elif isinstance(bbox, BaseGeometry):
bbox = bbox.bounds
assert len(bbox) == 4
# handle loading the mask
elif isinstance(mask, (GeoDataFrame, GeoSeries)):
mask = mapping(mask.to_crs(crs).union_all())
elif isinstance(mask, BaseGeometry):
mask = mapping(mask)
filters = {}
if bbox is not None:
filters["bbox"] = bbox
if mask is not None:
filters["mask"] = mask
if where is not None:
filters["where"] = where
# setup the data loading filter
if rows is not None:
if isinstance(rows, int):
rows = slice(rows)
elif not isinstance(rows, slice):
raise TypeError("'rows' must be an integer or a slice.")
f_filt = features.filter(rows.start, rows.stop, rows.step, **filters)
elif filters:
f_filt = features.filter(**filters)
else:
f_filt = features
# get list of columns
columns = columns or list(features.schema["properties"])
datetime_fields = [
k for (k, v) in features.schema["properties"].items() if v == "datetime"
]
if (
kwargs.get("ignore_geometry", False)
or features.schema["geometry"] == "None"
):
df = pd.DataFrame(
[record["properties"] for record in f_filt], columns=columns
)
else:
df = GeoDataFrame.from_features(
f_filt, crs=crs, columns=columns + ["geometry"]
)
for k in datetime_fields:
as_dt = None
                # plain try/except for when pandas will raise in the future
                # TODO we can tighten the exception type when it does
try:
with warnings.catch_warnings():
                        # pandas 2.x does not yet enforce this behaviour but raises a
                        # warning -> we want to suppress this warning for our users,
                        # and do this by turning it into an error so we take the
                        # `except` code path to try again with utc=True
warnings.filterwarnings(
"error",
"In a future version of pandas, parsing datetimes with "
"mixed time zones will raise an error",
FutureWarning,
)
as_dt = pd.to_datetime(df[k])
except Exception:
pass
if as_dt is None or as_dt.dtype == "object":
# if to_datetime failed, try again for mixed timezone offsets
# This can still fail if there are invalid datetimes
try:
as_dt = pd.to_datetime(df[k], utc=True)
except Exception:
pass
# if to_datetime succeeded, round datetimes as
# fiona only supports up to ms precision (any microseconds are
# floating point rounding error)
                if as_dt is not None and as_dt.dtype != "object":
if PANDAS_GE_20:
df[k] = as_dt.dt.as_unit("ms")
else:
df[k] = as_dt.dt.round(freq="ms")
return df
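# Illustrative sketch, not part of the original module: how the 'rows'
# keyword handled above selects features; the file path is hypothetical.
def _demo_read_rows():  # pragma: no cover - documentation sketch
    import geopandas
    head = geopandas.read_file("example.gpkg", rows=10)  # first 10 features
    window = geopandas.read_file("example.gpkg", rows=slice(10, 20))
    return head, window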
def _read_file_pyogrio(path_or_bytes, bbox=None, mask=None, rows=None, **kwargs):
import pyogrio
if rows is not None:
if isinstance(rows, int):
kwargs["max_features"] = rows
elif isinstance(rows, slice):
if rows.start is not None:
if rows.start < 0:
raise ValueError(
"Negative slice start not supported with the 'pyogrio' engine."
)
kwargs["skip_features"] = rows.start
if rows.stop is not None:
kwargs["max_features"] = rows.stop - (rows.start or 0)
if rows.step is not None:
raise ValueError("slice with step is not supported")
else:
raise TypeError("'rows' must be an integer or a slice.")
if bbox is not None and mask is not None:
# match error message from Fiona
raise ValueError("mask and bbox can not be set together")
if bbox is not None:
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
crs = pyogrio.read_info(path_or_bytes).get("crs")
if isinstance(path_or_bytes, IOBase):
path_or_bytes.seek(0)
bbox = tuple(bbox.to_crs(crs).total_bounds)
elif isinstance(bbox, BaseGeometry):
bbox = bbox.bounds
if len(bbox) != 4:
raise ValueError("'bbox' should be a length-4 tuple.")
if mask is not None:
# NOTE: mask cannot be used at same time as bbox keyword
if isinstance(mask, (GeoDataFrame, GeoSeries)):
crs = pyogrio.read_info(path_or_bytes).get("crs")
if isinstance(path_or_bytes, IOBase):
path_or_bytes.seek(0)
mask = shapely.unary_union(mask.to_crs(crs).geometry.values)
elif isinstance(mask, BaseGeometry):
mask = shapely.unary_union(mask)
elif isinstance(mask, dict) or hasattr(mask, "__geo_interface__"):
# convert GeoJSON to shapely geometry
mask = shapely.geometry.shape(mask)
kwargs["mask"] = mask
if kwargs.pop("ignore_geometry", False):
kwargs["read_geometry"] = False
# translate `ignore_fields`/`include_fields` keyword for back compat with fiona
if "ignore_fields" in kwargs and "include_fields" in kwargs:
raise ValueError("Cannot specify both 'ignore_fields' and 'include_fields'")
elif "ignore_fields" in kwargs:
if kwargs.get("columns", None) is not None:
raise ValueError(
"Cannot specify both 'columns' and 'ignore_fields' keywords"
)
warnings.warn(
"The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
"will be removed in a future release. You can use the 'columns' keyword "
"instead to select which columns to read.",
DeprecationWarning,
stacklevel=3,
)
ignore_fields = kwargs.pop("ignore_fields")
fields = pyogrio.read_info(path_or_bytes)["fields"]
include_fields = [col for col in fields if col not in ignore_fields]
kwargs["columns"] = include_fields
elif "include_fields" in kwargs:
# translate `include_fields` keyword for back compat with fiona engine
if kwargs.get("columns", None) is not None:
raise ValueError(
"Cannot specify both 'columns' and 'include_fields' keywords"
)
warnings.warn(
"The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
"will be removed in a future release. You can use the 'columns' keyword "
"instead to select which columns to read.",
DeprecationWarning,
stacklevel=3,
)
kwargs["columns"] = kwargs.pop("include_fields")
return pyogrio.read_dataframe(path_or_bytes, bbox=bbox, **kwargs)
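# Illustrative sketch, not part of the original module: the slice-to-pyogrio
# keyword translation performed above, shown standalone; e.g. slice(10, 25)
# becomes {"skip_features": 10, "max_features": 15}.
def _demo_slice_to_pyogrio_kwargs(rows):  # pragma: no cover - sketch
    kwargs = {}
    if isinstance(rows, int):
        kwargs["max_features"] = rows
    elif isinstance(rows, slice):
        if rows.start is not None:
            kwargs["skip_features"] = rows.start
        if rows.stop is not None:
            kwargs["max_features"] = rows.stop - (rows.start or 0)
    return kwargs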
def _detect_driver(path):
"""
Attempt to auto-detect driver based on the extension
"""
try:
# in case the path is a file handle
path = path.name
except AttributeError:
pass
try:
return _EXTENSION_TO_DRIVER[Path(path).suffix.lower()]
except KeyError:
# Assume it is a shapefile folder for now. In the future,
# will likely raise an exception when the expected
# folder writing behavior is more clearly defined.
return "ESRI Shapefile"
def _to_file(
df,
filename,
driver=None,
schema=None,
index=None,
mode="w",
crs=None,
engine=None,
metadata=None,
**kwargs,
):
"""
Write this GeoDataFrame to an OGR data source
A dictionary of supported OGR providers is available via:
>>> import pyogrio
>>> pyogrio.list_drivers() # doctest: +SKIP
Parameters
----------
df : GeoDataFrame to be written
filename : string
File path or file handle to write to. The path may specify a
GDAL VSI scheme.
driver : string, default None
The OGR format driver used to write the vector file.
If not specified, it attempts to infer it from the file extension.
If no extension is specified, it saves ESRI Shapefile to a folder.
schema : dict, default None
If specified, the schema dictionary is passed to Fiona to
better control how the file is written. If None, GeoPandas
will determine the schema based on each column's dtype.
Not supported for the "pyogrio" engine.
index : bool, default None
If True, write index into one or more columns (for MultiIndex).
Default None writes the index into one or more columns only if
the index is named, is a MultiIndex, or has a non-integer data
type. If False, no index is written.
.. versionadded:: 0.7
Previously the index was not written.
mode : string, default 'w'
The write mode, 'w' to overwrite the existing file and 'a' to append;
when using the pyogrio engine, you can also pass ``append=True``.
Not all drivers support appending. For the fiona engine, the drivers
that support appending are listed in fiona.supported_drivers or
https://github.com/Toblerity/Fiona/blob/master/fiona/drvsupport.py.
For the pyogrio engine, you should be able to use any driver that
is available in your installation of GDAL that supports append
capability; see the specific driver entry at
https://gdal.org/drivers/vector/index.html for more information.
crs : pyproj.CRS, default None
If specified, the CRS is passed to Fiona to
better control how the file is written. If None, GeoPandas
will determine the crs based on crs df attribute.
The value can be anything accepted
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
engine : str, "pyogrio" or "fiona"
The underlying library that is used to read the file. Currently, the
supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
installed, otherwise tries "fiona". Engine can also be set globally
with the ``geopandas.options.io_engine`` option.
metadata : dict[str, str], default None
Optional metadata to be stored in the file. Keys and values must be
strings. Only supported for the "GPKG" driver
(requires Fiona >= 1.9 or pyogrio >= 0.6).
**kwargs :
Keyword args to be passed to the engine, and can be used to write
to multi-layer data, store data within archives (zip files), etc.
In case of the "fiona" engine, the keyword arguments are passed to
        ``fiona.open``. For more information on possible keywords, type:
``import fiona; help(fiona.open)``. In case of the "pyogrio" engine,
the keyword arguments are passed to `pyogrio.write_dataframe`.
Notes
-----
The format drivers will attempt to detect the encoding of your data, but
may fail. In this case, the proper encoding can be specified explicitly
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
"""
engine = _check_engine(engine, "'to_file' method")
filename = _expand_user(filename)
if index is None:
# Determine if index attribute(s) should be saved to file
# (only if they are named or are non-integer)
index = list(df.index.names) != [None] or not is_integer_dtype(df.index.dtype)
if index:
df = df.reset_index(drop=False)
if driver is None:
driver = _detect_driver(filename)
if driver == "ESRI Shapefile" and any(len(c) > 10 for c in df.columns.tolist()):
warnings.warn(
"Column names longer than 10 characters will be truncated when saved to "
"ESRI Shapefile.",
stacklevel=3,
)
if (df.dtypes == "geometry").sum() > 1:
raise ValueError(
"GeoDataFrame contains multiple geometry columns but GeoDataFrame.to_file "
"supports only a single geometry column. Use a GeoDataFrame.to_parquet or "
"GeoDataFrame.to_feather, drop additional geometry columns or convert them "
"to a supported format like a well-known text (WKT) using "
"`GeoSeries.to_wkt()`.",
)
_check_metadata_supported(metadata, engine, driver)
if mode not in ("w", "a"):
raise ValueError(f"'mode' should be one of 'w' or 'a', got '{mode}' instead")
if engine == "pyogrio":
_to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs)
elif engine == "fiona":
_to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs)
else:
raise ValueError(f"unknown engine '{engine}'")
def _to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs):
if not HAS_PYPROJ and crs:
raise ImportError(
"The 'pyproj' package is required to write a file with a CRS, but it is not"
" installed or does not import correctly."
)
if schema is None:
schema = infer_schema(df)
if crs:
from pyproj import CRS
crs = CRS.from_user_input(crs)
else:
crs = df.crs
with fiona_env():
crs_wkt = None
try:
gdal_version = Version(
fiona.env.get_gdal_release_name().strip("e")
) # GH3147
except (AttributeError, ValueError):
gdal_version = Version("2.0.0") # just assume it is not the latest
if gdal_version >= Version("3.0.0") and crs:
crs_wkt = crs.to_wkt()
elif crs:
crs_wkt = crs.to_wkt("WKT1_GDAL")
with fiona.open(
filename, mode=mode, driver=driver, crs_wkt=crs_wkt, schema=schema, **kwargs
) as colxn:
if metadata is not None:
colxn.update_tags(metadata)
colxn.writerecords(df.iterfeatures())
def _to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs):
import pyogrio
if schema is not None:
raise ValueError(
"The 'schema' argument is not supported with the 'pyogrio' engine."
)
if mode == "a":
kwargs["append"] = True
if crs is not None:
raise ValueError("Passing 'crs' is not supported with the 'pyogrio' engine.")
# for the fiona engine, this check is done in gdf.iterfeatures()
if not df.columns.is_unique:
raise ValueError("GeoDataFrame cannot contain duplicated column names.")
pyogrio.write_dataframe(df, filename, driver=driver, metadata=metadata, **kwargs)
def infer_schema(df):
from collections import OrderedDict
# TODO: test pandas string type and boolean type once released
types = {
"Int32": "int32",
"int32": "int32",
"Int64": "int",
"string": "str",
"boolean": "bool",
}
def convert_type(column, in_type):
if in_type == object:
return "str"
if in_type.name.startswith("datetime64"):
# numpy datetime type regardless of frequency
return "datetime"
if str(in_type) in types:
out_type = types[str(in_type)]
else:
out_type = type(np.zeros(1, in_type).item()).__name__
if out_type == "long":
out_type = "int"
return out_type
properties = OrderedDict(
[
(col, convert_type(col, _type))
for col, _type in zip(df.columns, df.dtypes)
if col != df._geometry_column_name
]
)
if df.empty:
warnings.warn(
"You are attempting to write an empty DataFrame to file. "
"For some drivers, this operation may fail.",
UserWarning,
stacklevel=3,
)
# Since https://github.com/Toblerity/Fiona/issues/446 resolution,
# Fiona allows a list of geometry types
geom_types = _geometry_types(df)
schema = {"geometry": geom_types, "properties": properties}
return schema
def _geometry_types(df):
"""
Determine the geometry types in the GeoDataFrame for the schema.
"""
geom_types_2D = df[~df.geometry.has_z].geometry.geom_type.unique()
geom_types_2D = [gtype for gtype in geom_types_2D if gtype is not None]
geom_types_3D = df[df.geometry.has_z].geometry.geom_type.unique()
geom_types_3D = ["3D " + gtype for gtype in geom_types_3D if gtype is not None]
geom_types = geom_types_3D + geom_types_2D
if len(geom_types) == 0:
# Default geometry type supported by Fiona
# (Since https://github.com/Toblerity/Fiona/issues/446 resolution)
return "Unknown"
if len(geom_types) == 1:
geom_types = geom_types[0]
return geom_types
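# Illustrative sketch, not part of the original module: what infer_schema
# produces for a small hypothetical frame.
def _demo_infer_schema():  # pragma: no cover - documentation sketch
    from shapely.geometry import Point
    df = GeoDataFrame({"name": ["a", "b"]},
                      geometry=[Point(0, 0), Point(1, 1)])
    return infer_schema(df)
    # -> {"geometry": "Point", "properties": OrderedDict([("name", "str")])}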
def _list_layers(filename) -> pd.DataFrame:
"""List layers available in a file.
Provides an overview of layers available in a file or URL together with their
geometry types. When supported by the data source, this includes both spatial and
non-spatial layers. Non-spatial layers are indicated by the ``"geometry_type"``
column being ``None``. GeoPandas will not read such layers but they can be read into
a pd.DataFrame using :func:`pyogrio.read_dataframe`.
Parameters
----------
filename : str, path object or file-like object
Either the absolute or relative path to the file or URL to
be opened, or any object with a read() method (such as an open file
or StringIO)
Returns
-------
pandas.DataFrame
A DataFrame with columns "name" and "geometry_type" and one row per layer.
"""
_import_pyogrio()
_check_pyogrio("list_layers")
import pyogrio
return pd.DataFrame(
pyogrio.list_layers(filename), columns=["name", "geometry_type"]
)
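# Illustrative sketch, not part of the original module: the frame returned
# by _list_layers for a hypothetical GeoPackage with one spatial and one
# non-spatial layer:
#
#        name geometry_type
#   0   roads    LineString
#   1  lookup          None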

View File

@@ -1,473 +0,0 @@
import warnings
from contextlib import contextmanager
from functools import lru_cache
import pandas as pd
import shapely
import shapely.wkb
from geopandas import GeoDataFrame
@contextmanager
def _get_conn(conn_or_engine):
"""
Yield a connection within a transaction context.
Engine.begin() returns a Connection with an implicit Transaction while
Connection.begin() returns the Transaction. This helper will always return a
Connection with an implicit (possibly nested) Transaction.
Parameters
----------
conn_or_engine : Connection or Engine
A sqlalchemy Connection or Engine instance
Returns
-------
Connection
"""
from sqlalchemy.engine.base import Connection, Engine
if isinstance(conn_or_engine, Connection):
if not conn_or_engine.in_transaction():
with conn_or_engine.begin():
yield conn_or_engine
else:
yield conn_or_engine
elif isinstance(conn_or_engine, Engine):
with conn_or_engine.begin() as conn:
yield conn
else:
raise ValueError(f"Unknown Connectable: {conn_or_engine}")
def _df_to_geodf(df, geom_col="geom", crs=None, con=None):
"""
Transforms a pandas DataFrame into a GeoDataFrame.
The column 'geom_col' must be a geometry column in WKB representation.
To be used to convert df based on pd.read_sql to gdf.
Parameters
----------
df : DataFrame
pandas DataFrame with geometry column in WKB representation.
geom_col : string, default 'geom'
column name to convert to shapely geometries
crs : pyproj.CRS, optional
CRS to use for the returned GeoDataFrame. The value can be anything accepted
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
If not set, tries to determine CRS from the SRID associated with the
first geometry in the database, and assigns that to all geometries.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the database to query.
Returns
-------
GeoDataFrame
"""
if geom_col not in df:
raise ValueError("Query missing geometry column '{}'".format(geom_col))
if df.columns.to_list().count(geom_col) > 1:
raise ValueError(
f"Duplicate geometry column '{geom_col}' detected in SQL query output. Only"
"one geometry column is allowed."
)
geoms = df[geom_col].dropna()
if not geoms.empty:
        # load geometries from raw (Python 3) binary
        load_geom_bytes = shapely.wkb.loads
def load_geom_text(x):
"""Load from binary encoded as text."""
return shapely.wkb.loads(str(x), hex=True)
if isinstance(geoms.iat[0], bytes):
load_geom = load_geom_bytes
else:
load_geom = load_geom_text
df[geom_col] = geoms = geoms.apply(load_geom)
if crs is None:
srid = shapely.get_srid(geoms.iat[0])
# if no defined SRID in geodatabase, returns SRID of 0
if srid != 0:
try:
spatial_ref_sys_df = _get_spatial_ref_sys_df(con, srid)
except pd.errors.DatabaseError:
warning_msg = (
f"Could not find the spatial reference system table "
f"(spatial_ref_sys) in PostGIS."
f"Trying epsg:{srid} as a fallback."
)
warnings.warn(warning_msg, UserWarning, stacklevel=3)
crs = "epsg:{}".format(srid)
else:
if not spatial_ref_sys_df.empty:
auth_name = spatial_ref_sys_df["auth_name"].item()
crs = f"{auth_name}:{srid}"
else:
warning_msg = (
f"Could not find srid {srid} in the "
f"spatial_ref_sys table. "
f"Trying epsg:{srid} as a fallback."
)
warnings.warn(warning_msg, UserWarning, stacklevel=3)
crs = "epsg:{}".format(srid)
return GeoDataFrame(df, crs=crs, geometry=geom_col)
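# Illustrative sketch, not part of the original module: the two WKB loaders
# chosen above, applied to one point stored as raw bytes (as psycopg2
# returns) and as a hex-encoded string (as some drivers return).
def _demo_wkb_loaders():  # pragma: no cover - documentation sketch
    from shapely.geometry import Point
    raw = shapely.wkb.dumps(Point(1, 2))              # bytes
    hexed = shapely.wkb.dumps(Point(1, 2), hex=True)  # str
    assert shapely.wkb.loads(raw) == shapely.wkb.loads(hexed, hex=True)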
def _read_postgis(
sql,
con,
geom_col="geom",
crs=None,
index_col=None,
coerce_float=True,
parse_dates=None,
params=None,
chunksize=None,
):
"""
Returns a GeoDataFrame corresponding to the result of the query
string, which must contain a geometry column in WKB representation.
It is also possible to use :meth:`~GeoDataFrame.read_file` to read from a database.
Especially for file geodatabases like GeoPackage or SpatiaLite this can be easier.
Parameters
----------
sql : string
SQL query to execute in selecting entries from database, or name
of the table to read from the database.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the database to query.
geom_col : string, default 'geom'
column name to convert to shapely geometries
crs : dict or str, optional
CRS to use for the returned GeoDataFrame; if not set, tries to
determine CRS from the SRID associated with the first geometry in
the database, and assigns that to all geometries.
chunksize : int, default None
If specified, return an iterator where chunksize is the number of rows to
include in each chunk.
See the documentation for pandas.read_sql for further explanation
of the following parameters:
index_col, coerce_float, parse_dates, params, chunksize
Returns
-------
GeoDataFrame
Examples
--------
PostGIS
>>> from sqlalchemy import create_engine # doctest: +SKIP
>>> db_connection_url = "postgresql://myusername:mypassword@myhost:5432/mydatabase"
>>> con = create_engine(db_connection_url) # doctest: +SKIP
>>> sql = "SELECT geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
SpatiaLite
>>> sql = "SELECT ST_AsBinary(geom) AS geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
"""
if chunksize is None:
# read all in one chunk and return a single GeoDataFrame
df = pd.read_sql(
sql,
con,
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
params=params,
chunksize=chunksize,
)
return _df_to_geodf(df, geom_col=geom_col, crs=crs, con=con)
else:
# read data in chunks and return a generator
df_generator = pd.read_sql(
sql,
con,
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
params=params,
chunksize=chunksize,
)
return (
_df_to_geodf(df, geom_col=geom_col, crs=crs, con=con) for df in df_generator
)
def _get_geometry_type(gdf):
"""
Get basic geometry type of a GeoDataFrame. See more info from:
https://geoalchemy-2.readthedocs.io/en/latest/types.html#geoalchemy2.types._GISType
    The following rules apply:
     - if all geometries share the same geometry type, they are inserted
       with that GeometryType; the possible types are Point, LineString,
       Polygon, MultiPoint, MultiLineString, MultiPolygon and
       GeometryCollection.
        - LinearRing geometries will be converted into LineString objects.
     - in all other cases, geometries will be inserted with type GEOMETRY:
        - a mix of Polygons and MultiPolygons in GeoSeries
        - a mix of Points and LineStrings in GeoSeries
        - geometry is of type GeometryCollection,
          such as GeometryCollection([Point, LineStrings])
     - if any of the geometries has a Z-coordinate, all records will
       be written as 3D.
"""
geom_types = list(gdf.geometry.geom_type.unique())
has_curve = False
for gt in geom_types:
if gt is None:
continue
elif "LinearRing" in gt:
has_curve = True
if len(geom_types) == 1:
if has_curve:
target_geom_type = "LINESTRING"
else:
if geom_types[0] is None:
raise ValueError("No valid geometries in the data.")
else:
target_geom_type = geom_types[0].upper()
else:
target_geom_type = "GEOMETRY"
# Check for 3D-coordinates
if any(gdf.geometry.has_z):
target_geom_type += "Z"
return target_geom_type, has_curve
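# Illustrative sketch, not part of the original module: target types chosen
# by _get_geometry_type for a few small hypothetical frames.
def _demo_target_geometry_types():  # pragma: no cover - documentation sketch
    from shapely.geometry import LineString, Point
    points = GeoDataFrame(geometry=[Point(0, 0), Point(1, 1)])
    mixed = GeoDataFrame(geometry=[Point(0, 0), LineString([(0, 0), (1, 1)])])
    with_z = GeoDataFrame(geometry=[Point(0, 0, 5)])
    assert _get_geometry_type(points) == ("POINT", False)
    assert _get_geometry_type(mixed) == ("GEOMETRY", False)
    assert _get_geometry_type(with_z) == ("POINTZ", False)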
def _get_srid_from_crs(gdf):
"""
Get EPSG code from CRS if available. If not, return 0.
"""
# Use geoalchemy2 default for srid
# Note: undefined srid in PostGIS is 0
srid = None
warning_msg = (
"Could not parse CRS from the GeoDataFrame. "
"Inserting data without defined CRS."
)
if gdf.crs is not None:
try:
for confidence in (100, 70, 25):
srid = gdf.crs.to_epsg(min_confidence=confidence)
if srid is not None:
break
auth_srid = gdf.crs.to_authority(
auth_name="ESRI", min_confidence=confidence
)
if auth_srid is not None:
srid = int(auth_srid[1])
break
except Exception:
warnings.warn(warning_msg, UserWarning, stacklevel=2)
if srid is None:
srid = 0
warnings.warn(warning_msg, UserWarning, stacklevel=2)
return srid
def _convert_linearring_to_linestring(gdf, geom_name):
from shapely.geometry import LineString
# Todo: Use shapely function once it's implemented:
# https://github.com/shapely/shapely/issues/1617
mask = gdf.geom_type == "LinearRing"
gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(
lambda geom: LineString(geom)
)
return gdf
def _convert_to_ewkb(gdf, geom_name, srid):
"""Convert geometries to ewkb."""
geoms = shapely.to_wkb(
shapely.set_srid(gdf[geom_name].values._data, srid=srid),
hex=True,
include_srid=True,
)
# The gdf will warn that the geometry column doesn't hold in-memory geometries
# now that they are EWKB, so convert back to a regular dataframe to avoid warning
# the user that the dtypes are unexpected.
df = pd.DataFrame(gdf, copy=False)
df[geom_name] = geoms
return df
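# Illustrative sketch, not part of the original module: producing a single
# hex EWKB string with an embedded SRID, the same transform applied
# column-wise above.
def _demo_ewkb():  # pragma: no cover - documentation sketch
    from shapely.geometry import Point
    geom = shapely.set_srid(Point(1, 2), 4326)
    ewkb = shapely.to_wkb(geom, hex=True, include_srid=True)
    assert shapely.get_srid(shapely.from_wkb(ewkb)) == 4326
    return ewkb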
def _psql_insert_copy(tbl, conn, keys, data_iter):
import csv
import io
s_buf = io.StringIO()
writer = csv.writer(s_buf)
writer.writerows(data_iter)
s_buf.seek(0)
columns = ", ".join('"{}"'.format(k) for k in keys)
dbapi_conn = conn.connection
sql = 'COPY "{}"."{}" ({}) FROM STDIN WITH CSV'.format(
tbl.table.schema, tbl.table.name, columns
)
with dbapi_conn.cursor() as cur:
# Use psycopg method if it's available
if hasattr(cur, "copy") and callable(cur.copy):
with cur.copy(sql) as copy:
copy.write(s_buf.read())
else: # otherwise use psycopg2 method
cur.copy_expert(sql, s_buf)
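# Illustrative sketch, not part of the original module: for a hypothetical
# table "public"."roads" with columns "geom" and "highway", the statement
# built above reads:
#
#   COPY "public"."roads" ("geom", "highway") FROM STDIN WITH CSV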
def _write_postgis(
gdf,
name,
con,
schema=None,
if_exists="fail",
index=False,
index_label=None,
chunksize=None,
dtype=None,
):
"""
Upload GeoDataFrame into PostGIS database.
This method requires SQLAlchemy and GeoAlchemy2, and a PostgreSQL
Python driver (e.g. psycopg2) to be installed.
Parameters
----------
name : str
Name of the target table.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the PostGIS database.
if_exists : {'fail', 'replace', 'append'}, default 'fail'
How to behave if the table already exists:
- fail: Raise a ValueError.
- replace: Drop the table before inserting new values.
- append: Insert new values to the existing table.
schema : string, optional
Specify the schema. If None, use default schema: 'public'.
    index : bool, default False
Write DataFrame index as a column.
Uses *index_label* as the column name in the table.
index_label : string or sequence, default None
Column label for index column(s).
If None is given (default) and index is True,
then the index names are used.
chunksize : int, optional
Rows will be written in batches of this size at a time.
By default, all rows will be written at once.
dtype : dict of column name to SQL type, default None
Specifying the datatype for columns.
The keys should be the column names and the values
should be the SQLAlchemy types.
Examples
--------
>>> from sqlalchemy import create_engine # doctest: +SKIP
    >>> engine = create_engine("postgresql://myusername:mypassword@myhost:5432\
/mydatabase")  # doctest: +SKIP
>>> gdf.to_postgis("my_table", engine) # doctest: +SKIP
"""
try:
from geoalchemy2 import Geometry
from sqlalchemy import text
except ImportError:
raise ImportError("'to_postgis()' requires geoalchemy2 package.")
gdf = gdf.copy()
geom_name = gdf.geometry.name
# Get srid
srid = _get_srid_from_crs(gdf)
# Get geometry type and info whether data contains LinearRing.
geometry_type, has_curve = _get_geometry_type(gdf)
# Build dtype with Geometry
if dtype is not None:
dtype[geom_name] = Geometry(geometry_type=geometry_type, srid=srid)
else:
dtype = {geom_name: Geometry(geometry_type=geometry_type, srid=srid)}
# Convert LinearRing geometries to LineString
if has_curve:
gdf = _convert_linearring_to_linestring(gdf, geom_name)
# Convert geometries to EWKB
gdf = _convert_to_ewkb(gdf, geom_name, srid)
if schema is not None:
schema_name = schema
else:
schema_name = "public"
if if_exists == "append":
# Check that the geometry srid matches with the current GeoDataFrame
with _get_conn(con) as connection:
# Only check SRID if table exists
if connection.dialect.has_table(connection, name, schema):
target_srid = connection.execute(
text(
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
schema=schema_name, table=name, geom_col=geom_name
)
)
).fetchone()[0]
if target_srid != srid:
msg = (
"The CRS of the target table (EPSG:{epsg_t}) differs from the "
"CRS of current GeoDataFrame (EPSG:{epsg_src}).".format(
epsg_t=target_srid, epsg_src=srid
)
)
raise ValueError(msg)
with _get_conn(con) as connection:
gdf.to_sql(
name,
connection,
schema=schema_name,
if_exists=if_exists,
index=index,
index_label=index_label,
chunksize=chunksize,
dtype=dtype,
method=_psql_insert_copy,
)
@lru_cache
def _get_spatial_ref_sys_df(con, srid):
spatial_ref_sys_sql = (
f"SELECT srid, auth_name FROM spatial_ref_sys WHERE srid = {srid}"
)
return pd.read_sql(spatial_ref_sys_sql, con)
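# Illustrative sketch, not part of the original module: resolving an SRID to
# the authority string used by _df_to_geodf; on a standard PostGIS install
# srid 4326 typically yields "EPSG:4326". 'con' is any live connection.
def _demo_srid_to_crs(con, srid=4326):  # pragma: no cover - sketch
    df = _get_spatial_ref_sys_df(con, srid)
    if not df.empty:
        return f"{df['auth_name'].item()}:{srid}"
    return None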

View File

@@ -1,100 +0,0 @@
"""
Script to create the data and write legacy storage (pickle) files.
Based on pandas' generate_legacy_storage_files.py script.
To use this script, create an environment for which you want to
generate pickles, activate the environment, and run this script as:
$ python geopandas/geopandas/io/tests/generate_legacy_storage_files.py \
geopandas/geopandas/io/tests/data/pickle/ pickle
This script generates a storage file for the current arch, system,
and python version.
The idea here is you are using the *current* version of the
generate_legacy_storage_files with an *older* version of geopandas to
generate a pickle file. We will then check this file into a current
branch, and test using test_pickle.py. This will load the *older*
pickles and test versus the current data that is generated
(with master). These are then compared.
"""
import os
import pickle
import platform
import sys
import pandas as pd
from shapely.geometry import Point
import geopandas
def create_pickle_data():
"""create the pickle data"""
# custom geometry column name
gdf_the_geom = geopandas.GeoDataFrame(
{"a": [1, 2, 3], "the_geom": [Point(1, 1), Point(2, 2), Point(3, 3)]},
geometry="the_geom",
)
# with crs
gdf_crs = geopandas.GeoDataFrame(
{"a": [0.1, 0.2, 0.3], "geometry": [Point(1, 1), Point(2, 2), Point(3, 3)]},
crs="EPSG:4326",
)
return {"gdf_the_geom": gdf_the_geom, "gdf_crs": gdf_crs}
def platform_name():
return "_".join(
[
str(geopandas.__version__),
"pd-" + str(pd.__version__),
"py-" + str(platform.python_version()),
str(platform.machine()),
str(platform.system().lower()),
]
)
def write_legacy_pickles(output_dir):
print(
"This script generates a storage file for the current arch, system, "
"and python version"
)
print("geopandas version: {}").format(geopandas.__version__)
print(" output dir : {}".format(output_dir))
print(" storage format: pickle")
pth = "{}.pickle".format(platform_name())
    with open(os.path.join(output_dir, pth), "wb") as fh:
        pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)
print("created pickle file: {}".format(pth))
def main():
if len(sys.argv) != 3:
sys.exit(
"Specify output directory and storage type: generate_legacy_"
"storage_files.py <output_dir> <storage_type> "
)
output_dir = str(sys.argv[1])
storage_type = str(sys.argv[2])
if storage_type == "pickle":
write_legacy_pickles(output_dir=output_dir)
else:
sys.exit("storage_type must be one of {'pickle'}")
if __name__ == "__main__":
main()

View File

@@ -1,328 +0,0 @@
import os
from shapely.geometry import (
LineString,
MultiLineString,
MultiPoint,
MultiPolygon,
Point,
Polygon,
)
import geopandas
from geopandas import GeoDataFrame
from .test_file import FIONA_MARK, PYOGRIO_MARK
import pytest
from geopandas.testing import assert_geodataframe_equal
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
city_hall_boundaries = Polygon(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
)
vauquelin_place = Polygon(
(
(-73.5542465586147, 45.5081555487952),
(-73.5540185061397, 45.5084409343852),
(-73.5546126200639, 45.5086813829106),
(-73.5548825850032, 45.5084033554357),
(-73.5542465586147, 45.5081555487952),
)
)
city_hall_walls = [
LineString(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
)
),
LineString(
(
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
),
]
city_hall_entrance = Point(-73.553785, 45.508722)
city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)
point_3D = Point(-73.553785, 45.508722, 300)
# *****************************************
# TEST TOOLING
class _ExpectedError:
def __init__(self, error_type, error_message_match):
self.type = error_type
self.match = error_message_match
class _ExpectedErrorBuilder:
def __init__(self, composite_key):
self.composite_key = composite_key
def to_raise(self, error_type, error_match):
_expected_exceptions[self.composite_key] = _ExpectedError(
error_type, error_match
)
def _expect_writing(gdf, ogr_driver):
return _ExpectedErrorBuilder(_composite_key(gdf, ogr_driver))
def _composite_key(gdf, ogr_driver):
return frozenset([id(gdf), ogr_driver])
def _expected_error_on(gdf, ogr_driver):
composite_key = _composite_key(gdf, ogr_driver)
return _expected_exceptions.get(composite_key, None)
# *****************************************
# TEST CASES
_geodataframes_to_write = []
_expected_exceptions = {}
_CRS = "epsg:4326"
# ------------------
# gdf with Points
gdf = GeoDataFrame(
{"a": [1, 2]}, crs=_CRS, geometry=[city_hall_entrance, city_hall_balcony]
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with MultiPoints
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[
MultiPoint([city_hall_balcony, city_hall_council_chamber]),
MultiPoint([city_hall_entrance, city_hall_balcony, city_hall_council_chamber]),
],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with Points and MultiPoints
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[MultiPoint([city_hall_entrance, city_hall_balcony]), city_hall_balcony],
)
_geodataframes_to_write.append(gdf)
# 'ESRI Shapefile' driver supports writing LineString/MultiLineString and
# Polygon/MultiPolygon but does not mention Point/MultiPoint
# see https://www.gdal.org/drv_shapefile.html
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
# ------------------
# gdf with LineStrings
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=city_hall_walls)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with MultiLineStrings
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[MultiLineString(city_hall_walls), MultiLineString(city_hall_walls)],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with LineStrings and MultiLineStrings
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with Polygons
gdf = GeoDataFrame(
{"a": [1, 2]}, crs=_CRS, geometry=[city_hall_boundaries, vauquelin_place]
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with MultiPolygon
gdf = GeoDataFrame(
{"a": [1]},
crs=_CRS,
geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with Polygon and MultiPolygon
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with null geometry and Point
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, city_hall_entrance])
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with null geometry and 3D Point
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, point_3D])
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with null geometries only
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, None])
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with all shape types mixed together
gdf = GeoDataFrame(
{"a": [1, 2, 3, 4, 5, 6]},
crs=_CRS,
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_entrance,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
],
)
_geodataframes_to_write.append(gdf)
# Not supported by 'ESRI Shapefile' driver
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
# ------------------
# gdf with all 2D shape types and 3D Point mixed together
gdf = GeoDataFrame(
{"a": [1, 2, 3, 4, 5, 6, 7]},
crs=_CRS,
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_entrance,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
point_3D,
],
)
_geodataframes_to_write.append(gdf)
# Not supported by 'ESRI Shapefile' driver
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
@pytest.fixture(params=_geodataframes_to_write)
def geodataframe(request):
return request.param
@pytest.fixture(
params=[
("GeoJSON", ".geojson"),
("ESRI Shapefile", ".shp"),
("GPKG", ".gpkg"),
("SQLite", ".sqlite"),
]
)
def ogr_driver(request):
return request.param
@pytest.fixture(
params=[
pytest.param("fiona", marks=FIONA_MARK),
pytest.param("pyogrio", marks=PYOGRIO_MARK),
]
)
def engine(request):
return request.param
def test_to_file_roundtrip(tmpdir, geodataframe, ogr_driver, engine):
driver, ext = ogr_driver
output_file = os.path.join(str(tmpdir), "output_file" + ext)
write_kwargs = {}
if driver == "SQLite":
write_kwargs["spatialite"] = True
    # This if statement can be removed once the minimal fiona version is >= 1.8.20
if engine == "fiona":
from packaging.version import Version
import fiona
if Version(fiona.__version__) < Version("1.8.20"):
pytest.skip("SQLite driver only available from version 1.8.20")
# If only 3D Points, geometry_type needs to be specified for spatialite at the
# moment. This if can be removed once the following PR is released:
# https://github.com/geopandas/pyogrio/pull/223
if (
engine == "pyogrio"
        and len(geodataframe) == 2
and geodataframe.geometry[0] is None
and geodataframe.geometry[1] is not None
and geodataframe.geometry[1].has_z
):
write_kwargs["geometry_type"] = "Point Z"
expected_error = _expected_error_on(geodataframe, driver)
if expected_error:
with pytest.raises(
RuntimeError, match="Failed to write record|Could not add feature to layer"
):
geodataframe.to_file(
output_file, driver=driver, engine=engine, **write_kwargs
)
else:
if driver == "SQLite" and engine == "pyogrio":
try:
geodataframe.to_file(
output_file, driver=driver, engine=engine, **write_kwargs
)
except ValueError as e:
if "unrecognized option 'SPATIALITE'" in str(e):
pytest.xfail(
"pyogrio wheels from PyPI do not come with SpatiaLite support. "
f"Error: {e}"
)
raise
else:
geodataframe.to_file(
output_file, driver=driver, engine=engine, **write_kwargs
)
reloaded = geopandas.read_file(output_file, engine=engine)
if driver == "GeoJSON" and engine == "pyogrio":
# For GeoJSON files, the int64 column comes back as int32
reloaded["a"] = reloaded["a"].astype("int64")
assert_geodataframe_equal(geodataframe, reloaded, check_column_type="equiv")

View File

@@ -1,537 +0,0 @@
import contextlib
import json
import os
import pathlib
from packaging.version import Version
import numpy as np
import shapely
from shapely import MultiPoint, Point, box
from geopandas import GeoDataFrame, GeoSeries
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
pytest.importorskip("pyarrow")
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow import feather
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
def pa_table(table):
if Version(pa.__version__) < Version("14.0.0"):
return table._pa_table
else:
return pa.table(table)
def pa_array(array):
if Version(pa.__version__) < Version("14.0.0"):
return array._pa_array
else:
return pa.array(array)
def assert_table_equal(left, right, check_metadata=True):
geom_type = left["geometry"].type
# in case of Points (directly the inner fixed_size_list or struct type)
# -> there are NaNs for empties -> we need to compare them separately
# and then fill, because pyarrow.Table.equals considers NaNs as not equal
if pa.types.is_fixed_size_list(geom_type):
left_values = left["geometry"].chunk(0).values
right_values = right["geometry"].chunk(0).values
assert pc.is_nan(left_values).equals(pc.is_nan(right_values))
left_geoms = pa.FixedSizeListArray.from_arrays(
pc.replace_with_mask(left_values, pc.is_nan(left_values), 0.0),
type=left["geometry"].type,
)
right_geoms = pa.FixedSizeListArray.from_arrays(
pc.replace_with_mask(right_values, pc.is_nan(right_values), 0.0),
type=right["geometry"].type,
)
left = left.set_column(1, left.schema.field("geometry"), left_geoms)
right = right.set_column(1, right.schema.field("geometry"), right_geoms)
elif pa.types.is_struct(geom_type):
left_arr = left["geometry"].chunk(0)
right_arr = right["geometry"].chunk(0)
for i in range(left_arr.type.num_fields):
assert pc.is_nan(left_arr.field(i)).equals(pc.is_nan(right_arr.field(i)))
left_geoms = pa.StructArray.from_arrays(
[
pc.replace_with_mask(
left_arr.field(i), pc.is_nan(left_arr.field(i)), 0.0
)
for i in range(left_arr.type.num_fields)
],
fields=list(left["geometry"].type),
)
right_geoms = pa.StructArray.from_arrays(
[
pc.replace_with_mask(
right_arr.field(i), pc.is_nan(right_arr.field(i)), 0.0
)
for i in range(right_arr.type.num_fields)
],
fields=list(right["geometry"].type),
)
left = left.set_column(1, left.schema.field("geometry"), left_geoms)
right = right.set_column(1, right.schema.field("geometry"), right_geoms)
if left.equals(right, check_metadata=check_metadata):
return
if not left.schema.equals(right.schema):
raise AssertionError(
"Schema not equal\nLeft:\n{0}\nRight:\n{1}".format(
left.schema, right.schema
)
)
if check_metadata:
if not left.schema.equals(right.schema, check_metadata=True):
if not left.schema.metadata == right.schema.metadata:
raise AssertionError(
"Metadata not equal\nLeft:\n{0}\nRight:\n{1}".format(
left.schema.metadata, right.schema.metadata
)
)
for col in left.schema.names:
assert left.schema.field(col).equals(
right.schema.field(col), check_metadata=True
)
for col in left.column_names:
a_left = pa.concat_arrays(left.column(col).chunks)
a_right = pa.concat_arrays(right.column(col).chunks)
if not a_left.equals(a_right):
raise AssertionError(
"Column '{0}' not equal:\n{1}".format(col, a_left.diff(a_right))
)
raise AssertionError("Tables not equal for unknown reason")
@pytest.mark.skipif(
shapely.geos_version < (3, 9, 0),
reason="Checking for empty is buggy with GEOS<3.9",
) # an old GEOS is installed in the CI builds with the defaults channel
@pytest.mark.parametrize(
"dim",
[
"xy",
pytest.param(
"xyz",
marks=pytest.mark.skipif(
shapely.geos_version < (3, 10, 0),
reason="Cannot write 3D geometries with GEOS<3.10",
),
),
],
)
@pytest.mark.parametrize(
"geometry_type",
["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
@pytest.mark.parametrize(
"geometry_encoding, interleaved",
[("WKB", None), ("geoarrow", True), ("geoarrow", False)],
ids=["WKB", "geoarrow-interleaved", "geoarrow-separated"],
)
def test_geoarrow_export(geometry_type, dim, geometry_encoding, interleaved):
base_path = DATA_PATH / "geoarrow"
suffix = geometry_type + ("_z" if dim == "xyz" else "")
# Read the example data
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
df["row_number"] = df["row_number"].astype("int32")
df = GeoDataFrame(df)
df.geometry.array.crs = None
# Read the expected data
if geometry_encoding == "WKB":
filename = f"example-{suffix}-wkb.arrow"
else:
filename = f"example-{suffix}{'-interleaved' if interleaved else ''}.arrow"
expected = feather.read_table(base_path / filename)
# GeoDataFrame -> Arrow Table
result = pa_table(
df.to_arrow(geometry_encoding=geometry_encoding, interleaved=interleaved)
)
# remove the "pandas" metadata
result = result.replace_schema_metadata(None)
mask_nonempty = None
if (
geometry_encoding == "WKB"
and dim == "xyz"
and geometry_type.startswith("multi")
):
# for collections with z dimension, drop the empties because those don't
# roundtrip correctly to WKB
# (https://github.com/libgeos/geos/issues/888)
mask_nonempty = pa.array(np.asarray(~df.geometry.is_empty))
result = result.filter(mask_nonempty)
expected = expected.filter(mask_nonempty)
assert_table_equal(result, expected)
# GeoSeries -> Arrow array
if geometry_encoding != "WKB" and geometry_type == "point":
        # for points, we again have to handle NaNs separately; we already
        # did that for the table above, so skip this part
return
result_arr = pa_array(
df.geometry.to_arrow(
geometry_encoding=geometry_encoding, interleaved=interleaved
)
)
if mask_nonempty is not None:
result_arr = result_arr.filter(mask_nonempty)
assert result_arr.equals(expected["geometry"].chunk(0))
@pytest.mark.skipif(
Version(shapely.__version__) < Version("2.0.2"),
reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_multiple_geometry_crs(encoding):
pytest.importorskip("pyproj")
# ensure each geometry column has its own crs
gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
gdf["geom2"] = gdf.geometry.to_crs("epsg:3857")
result = pa_table(gdf.to_arrow(geometry_encoding=encoding))
meta1 = json.loads(
result.schema.field("geometry").metadata[b"ARROW:extension:metadata"]
)
assert json.loads(meta1["crs"])["id"]["code"] == 4326
meta2 = json.loads(
result.schema.field("geom2").metadata[b"ARROW:extension:metadata"]
)
assert json.loads(meta2["crs"])["id"]["code"] == 3857
roundtripped = GeoDataFrame.from_arrow(result)
assert_geodataframe_equal(gdf, roundtripped)
assert gdf.geometry.crs == "epsg:4326"
assert gdf.geom2.crs == "epsg:3857"
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_series_name_crs(encoding):
pytest.importorskip("pyproj")
pytest.importorskip("pyarrow", minversion="14.0.0")
gser = GeoSeries([box(0, 0, 10, 10)], crs="epsg:4326", name="geom")
schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
field = pa.Field._import_from_c_capsule(schema_capsule)
assert field.name == "geom"
    expected_name = (
        b"geoarrow.wkb" if encoding == "WKB" else b"geoarrow.polygon"
    )
    assert field.metadata[b"ARROW:extension:name"] == expected_name
meta = json.loads(field.metadata[b"ARROW:extension:metadata"])
assert json.loads(meta["crs"])["id"]["code"] == 4326
# ensure it also works without a name
gser = GeoSeries([box(0, 0, 10, 10)])
schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
field = pa.Field._import_from_c_capsule(schema_capsule)
assert field.name == ""
def test_geoarrow_unsupported_encoding():
gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
with pytest.raises(ValueError, match="Expected geometry encoding"):
gdf.to_arrow(geometry_encoding="invalid")
with pytest.raises(ValueError, match="Expected geometry encoding"):
gdf.geometry.to_arrow(geometry_encoding="invalid")
def test_geoarrow_mixed_geometry_types():
gdf = GeoDataFrame(
{"geometry": [Point(0, 0), box(0, 0, 10, 10)]},
crs="epsg:4326",
)
with pytest.raises(ValueError, match="Geometry type combination is not supported"):
gdf.to_arrow(geometry_encoding="geoarrow")
gdf = GeoDataFrame(
{"geometry": [Point(0, 0), MultiPoint([(0, 0), (1, 1)])]},
crs="epsg:4326",
)
result = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
assert (
result.schema.field("geometry").metadata[b"ARROW:extension:name"]
== b"geoarrow.multipoint"
)
@pytest.mark.parametrize("geom_type", ["point", "polygon"])
@pytest.mark.parametrize(
"encoding, interleaved", [("WKB", True), ("geoarrow", True), ("geoarrow", False)]
)
def test_geoarrow_missing(encoding, interleaved, geom_type):
# dummy test for single geometry type until missing values are included
# in the test data for test_geoarrow_export
gdf = GeoDataFrame(
geometry=[Point(0, 0) if geom_type == "point" else box(0, 0, 10, 10), None],
crs="epsg:4326",
)
if (
encoding == "geoarrow"
and geom_type == "point"
and interleaved
and Version(pa.__version__) < Version("15.0.0")
):
with pytest.raises(
ValueError,
match="Converting point geometries with missing values is not supported",
):
gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved)
return
result = pa_table(gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved))
assert result["geometry"].null_count == 1
assert result["geometry"].is_null().to_pylist() == [False, True]
def test_geoarrow_include_z():
gdf = GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1), Point()]})
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
assert table["geometry"].type.value_field.name == "xy"
assert table["geometry"].type.list_size == 2
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow", include_z=True))
assert table["geometry"].type.value_field.name == "xyz"
assert table["geometry"].type.list_size == 3
assert np.isnan(table["geometry"].chunk(0).values.to_numpy()[2::3]).all()
gdf = GeoDataFrame({"geometry": [Point(0, 0, 0), Point(1, 1, 1), Point()]})
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
assert table["geometry"].type.value_field.name == "xyz"
assert table["geometry"].type.list_size == 3
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow", include_z=False))
assert table["geometry"].type.value_field.name == "xy"
assert table["geometry"].type.list_size == 2
@contextlib.contextmanager
def with_geoarrow_extension_types():
gp = pytest.importorskip("geoarrow.pyarrow")
gp.register_extension_types()
try:
yield
finally:
gp.unregister_extension_types()
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
"geometry_type",
["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
def test_geoarrow_export_with_extension_types(geometry_type, dim):
# ensure the exported data can be imported by geoarrow-pyarrow and are
# recognized as extension types
base_path = DATA_PATH / "geoarrow"
suffix = geometry_type + ("_z" if dim == "xyz" else "")
# Read the example data
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
df["row_number"] = df["row_number"].astype("int32")
df = GeoDataFrame(df)
df.geometry.array.crs = None
pytest.importorskip("geoarrow.pyarrow")
with with_geoarrow_extension_types():
result1 = pa_table(df.to_arrow(geometry_encoding="WKB"))
assert isinstance(result1["geometry"].type, pa.ExtensionType)
result2 = pa_table(df.to_arrow(geometry_encoding="geoarrow"))
assert isinstance(result2["geometry"].type, pa.ExtensionType)
result3 = pa_table(df.to_arrow(geometry_encoding="geoarrow", interleaved=False))
assert isinstance(result3["geometry"].type, pa.ExtensionType)
@pytest.mark.skipif(
Version(shapely.__version__) < Version("2.0.2"),
reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
"geometry_type",
[
"point",
"linestring",
"polygon",
"multipoint",
"multilinestring",
"multipolygon",
],
)
def test_geoarrow_import(geometry_type, dim):
base_path = DATA_PATH / "geoarrow"
suffix = geometry_type + ("_z" if dim == "xyz" else "")
# Read the example data
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
df = GeoDataFrame(df)
df.geometry.crs = None
table1 = feather.read_table(base_path / f"example-{suffix}-wkb.arrow")
result1 = GeoDataFrame.from_arrow(table1)
assert_geodataframe_equal(result1, df)
table2 = feather.read_table(base_path / f"example-{suffix}-interleaved.arrow")
result2 = GeoDataFrame.from_arrow(table2)
assert_geodataframe_equal(result2, df)
table3 = feather.read_table(base_path / f"example-{suffix}.arrow")
result3 = GeoDataFrame.from_arrow(table3)
assert_geodataframe_equal(result3, df)
@pytest.mark.skipif(
Version(shapely.__version__) < Version("2.0.2"),
reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_import_geometry_column(encoding):
pytest.importorskip("pyproj")
    # ensure the intended geometry column is set as the active geometry
gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)])
gdf["centroid"] = gdf.geometry.centroid
result = GeoDataFrame.from_arrow(pa_table(gdf.to_arrow(geometry_encoding=encoding)))
assert_geodataframe_equal(result, gdf)
assert result.active_geometry_name == "geometry"
result = GeoDataFrame.from_arrow(
pa_table(gdf[["centroid"]].to_arrow(geometry_encoding=encoding))
)
assert result.active_geometry_name == "centroid"
result = GeoDataFrame.from_arrow(
pa_table(gdf.to_arrow(geometry_encoding=encoding)), geometry="centroid"
)
assert result.active_geometry_name == "centroid"
assert_geodataframe_equal(result, gdf.set_geometry("centroid"))
def test_geoarrow_import_missing_geometry():
pytest.importorskip("pyarrow", minversion="14.0.0")
table = pa.table({"a": [0, 1, 2], "b": [0.1, 0.2, 0.3]})
with pytest.raises(ValueError, match="No geometry column found"):
GeoDataFrame.from_arrow(table)
with pytest.raises(ValueError, match="No GeoArrow geometry field found"):
GeoSeries.from_arrow(table["a"].chunk(0))
def test_geoarrow_import_capsule_interface():
# ensure we can import non-pyarrow object
pytest.importorskip("pyarrow", minversion="14.0.0")
gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
result = GeoDataFrame.from_arrow(gdf.to_arrow())
assert_geodataframe_equal(result, gdf)
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
"geometry_type",
["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
def test_geoarrow_import_from_extension_types(geometry_type, dim):
# ensure the exported data can be imported by geoarrow-pyarrow and are
# recognized as extension types
pytest.importorskip("pyproj")
base_path = DATA_PATH / "geoarrow"
suffix = geometry_type + ("_z" if dim == "xyz" else "")
# Read the example data
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
df = GeoDataFrame(df, crs="EPSG:3857")
pytest.importorskip("geoarrow.pyarrow")
with with_geoarrow_extension_types():
result1 = GeoDataFrame.from_arrow(
pa_table(df.to_arrow(geometry_encoding="WKB"))
)
assert_geodataframe_equal(result1, df)
result2 = GeoDataFrame.from_arrow(
pa_table(df.to_arrow(geometry_encoding="geoarrow"))
)
assert_geodataframe_equal(result2, df)
result3 = GeoDataFrame.from_arrow(
pa_table(df.to_arrow(geometry_encoding="geoarrow", interleaved=False))
)
assert_geodataframe_equal(result3, df)
def test_geoarrow_import_geoseries():
pytest.importorskip("pyproj")
gp = pytest.importorskip("geoarrow.pyarrow")
ser = GeoSeries.from_wkt(["POINT (1 1)", "POINT (2 2)"], crs="EPSG:3857")
with with_geoarrow_extension_types():
arr = gp.array(ser.to_arrow(geometry_encoding="WKB"))
result = GeoSeries.from_arrow(arr)
assert_geoseries_equal(result, ser)
arr = gp.array(ser.to_arrow(geometry_encoding="geoarrow"))
result = GeoSeries.from_arrow(arr)
assert_geoseries_equal(result, ser)
# the name is lost when going through a pyarrow.Array
ser.name = "name"
arr = gp.array(ser.to_arrow())
result = GeoSeries.from_arrow(arr)
assert result.name is None
# we can specify the name as one of the kwargs
result = GeoSeries.from_arrow(arr, name="test")
assert_geoseries_equal(result, ser)
def test_geoarrow_import_unknown_geoarrow_type():
gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
table = pa_table(gdf.to_arrow())
schema = table.schema
new_field = schema.field("geometry").with_metadata(
{
b"ARROW:extension:name": b"geoarrow.unknown",
b"ARROW:extension:metadata": b"{}",
}
)
new_schema = pa.schema([schema.field(0), new_field])
new_table = table.cast(new_schema)
with pytest.raises(TypeError, match="Unknown GeoArrow extension type"):
GeoDataFrame.from_arrow(new_table)

View File

@@ -1,306 +0,0 @@
from collections import OrderedDict
import numpy as np
import pandas as pd
from shapely.geometry import (
LineString,
MultiLineString,
MultiPoint,
MultiPolygon,
Point,
Polygon,
)
from geopandas import GeoDataFrame
from geopandas.io.file import infer_schema
import pytest
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
city_hall_boundaries = Polygon(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
)
vauquelin_place = Polygon(
(
(-73.5542465586147, 45.5081555487952),
(-73.5540185061397, 45.5084409343852),
(-73.5546126200639, 45.5086813829106),
(-73.5548825850032, 45.5084033554357),
(-73.5542465586147, 45.5081555487952),
)
)
city_hall_walls = [
LineString(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
)
),
LineString(
(
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
),
]
city_hall_entrance = Point(-73.553785, 45.508722)
city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)
point_3D = Point(-73.553785, 45.508722, 300)
linestring_3D = LineString(
(
(-73.5541107525234, 45.5091983609661, 300),
(-73.5546126200639, 45.5086813829106, 300),
(-73.5540185061397, 45.5084409343852, 300),
)
)
polygon_3D = Polygon(
(
(-73.5541107525234, 45.5091983609661, 300),
(-73.5535801792994, 45.5089539203786, 300),
(-73.5541107525234, 45.5091983609661, 300),
)
)
def test_infer_schema_only_points():
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
assert infer_schema(df) == {"geometry": "Point", "properties": OrderedDict()}
def test_infer_schema_points_and_multipoints():
df = GeoDataFrame(
geometry=[
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
]
)
assert infer_schema(df) == {
"geometry": ["MultiPoint", "Point"],
"properties": OrderedDict(),
}
def test_infer_schema_only_multipoints():
df = GeoDataFrame(
geometry=[
MultiPoint(
[city_hall_entrance, city_hall_balcony, city_hall_council_chamber]
)
]
)
assert infer_schema(df) == {"geometry": "MultiPoint", "properties": OrderedDict()}
def test_infer_schema_only_linestrings():
df = GeoDataFrame(geometry=city_hall_walls)
assert infer_schema(df) == {"geometry": "LineString", "properties": OrderedDict()}
def test_infer_schema_linestrings_and_multilinestrings():
df = GeoDataFrame(geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]])
assert infer_schema(df) == {
"geometry": ["MultiLineString", "LineString"],
"properties": OrderedDict(),
}
def test_infer_schema_only_multilinestrings():
df = GeoDataFrame(geometry=[MultiLineString(city_hall_walls)])
assert infer_schema(df) == {
"geometry": "MultiLineString",
"properties": OrderedDict(),
}
def test_infer_schema_only_polygons():
df = GeoDataFrame(geometry=[city_hall_boundaries, vauquelin_place])
assert infer_schema(df) == {"geometry": "Polygon", "properties": OrderedDict()}
def test_infer_schema_polygons_and_multipolygons():
df = GeoDataFrame(
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
]
)
assert infer_schema(df) == {
"geometry": ["MultiPolygon", "Polygon"],
"properties": OrderedDict(),
}
def test_infer_schema_only_multipolygons():
df = GeoDataFrame(geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))])
assert infer_schema(df) == {"geometry": "MultiPolygon", "properties": OrderedDict()}
def test_infer_schema_multiple_shape_types():
df = GeoDataFrame(
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
]
)
assert infer_schema(df) == {
"geometry": [
"MultiPolygon",
"Polygon",
"MultiLineString",
"LineString",
"MultiPoint",
"Point",
],
"properties": OrderedDict(),
}
def test_infer_schema_mixed_3D_shape_type():
df = GeoDataFrame(
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
point_3D,
]
)
assert infer_schema(df) == {
"geometry": [
"3D Point",
"MultiPolygon",
"Polygon",
"MultiLineString",
"LineString",
"MultiPoint",
"Point",
],
"properties": OrderedDict(),
}
def test_infer_schema_mixed_3D_Point():
df = GeoDataFrame(geometry=[city_hall_balcony, point_3D])
assert infer_schema(df) == {
"geometry": ["3D Point", "Point"],
"properties": OrderedDict(),
}
def test_infer_schema_only_3D_Points():
df = GeoDataFrame(geometry=[point_3D, point_3D])
assert infer_schema(df) == {"geometry": "3D Point", "properties": OrderedDict()}
def test_infer_schema_mixed_3D_linestring():
df = GeoDataFrame(geometry=[city_hall_walls[0], linestring_3D])
assert infer_schema(df) == {
"geometry": ["3D LineString", "LineString"],
"properties": OrderedDict(),
}
def test_infer_schema_only_3D_linestrings():
df = GeoDataFrame(geometry=[linestring_3D, linestring_3D])
assert infer_schema(df) == {
"geometry": "3D LineString",
"properties": OrderedDict(),
}
def test_infer_schema_mixed_3D_Polygon():
df = GeoDataFrame(geometry=[city_hall_boundaries, polygon_3D])
assert infer_schema(df) == {
"geometry": ["3D Polygon", "Polygon"],
"properties": OrderedDict(),
}
def test_infer_schema_only_3D_Polygons():
df = GeoDataFrame(geometry=[polygon_3D, polygon_3D])
assert infer_schema(df) == {"geometry": "3D Polygon", "properties": OrderedDict()}
def test_infer_schema_null_geometry_and_2D_point():
df = GeoDataFrame(geometry=[None, city_hall_entrance])
# None geometry type is then omitted
assert infer_schema(df) == {"geometry": "Point", "properties": OrderedDict()}
def test_infer_schema_null_geometry_and_3D_point():
df = GeoDataFrame(geometry=[None, point_3D])
# None geometry type is then omitted
assert infer_schema(df) == {"geometry": "3D Point", "properties": OrderedDict()}
def test_infer_schema_null_geometry_all():
df = GeoDataFrame(geometry=[None, None])
# None geometry type is then replaced by 'Unknown'
# (default geometry type supported by Fiona)
assert infer_schema(df) == {"geometry": "Unknown", "properties": OrderedDict()}
@pytest.mark.parametrize(
"array_data,dtype", [([1, 2**31 - 1], np.int32), ([1, np.nan], pd.Int32Dtype())]
)
def test_infer_schema_int32(array_data, dtype):
int32col = pd.array(data=array_data, dtype=dtype)
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
df["int32_column"] = int32col
assert infer_schema(df) == {
"geometry": "Point",
"properties": OrderedDict([("int32_column", "int32")]),
}
def test_infer_schema_int64():
int64col = pd.array([1, np.nan], dtype=pd.Int64Dtype())
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
df["int64_column"] = int64col
assert infer_schema(df) == {
"geometry": "Point",
"properties": OrderedDict([("int64_column", "int")]),
}

View File

@@ -1,56 +0,0 @@
"""
See generate_legacy_storage_files.py for the creation of the legacy files.
"""
import glob
import os
import pathlib
import pandas as pd
import pytest
from geopandas.testing import assert_geodataframe_equal
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
@pytest.fixture(scope="module")
def current_pickle_data():
# our current version pickle data
from .generate_legacy_storage_files import create_pickle_data
return create_pickle_data()
files = glob.glob(str(DATA_PATH / "pickle" / "*.pickle"))
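# each pickle file written by an older geopandas version becomes one test
# case, identified by its filename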
@pytest.fixture(params=files, ids=[p.split("/")[-1] for p in files])
def legacy_pickle(request):
return request.param
@pytest.mark.skip(
reason=(
"shapely 2.0/pygeos-based unpickling currently only works for "
"shapely-2.0/pygeos-written files"
),
)
def test_legacy_pickles(current_pickle_data, legacy_pickle):
result = pd.read_pickle(legacy_pickle)
for name, value in result.items():
expected = current_pickle_data[name]
assert_geodataframe_equal(value, expected)
def test_round_trip_current(tmpdir, current_pickle_data):
data = current_pickle_data
for name, value in data.items():
path = str(tmpdir / "{}.pickle".format(name))
value.to_pickle(path)
result = pd.read_pickle(path)
assert_geodataframe_equal(result, value)
assert isinstance(result.has_sindex, bool)

View File

@@ -1,878 +0,0 @@
"""
Tests here include reading/writing to different types of spatial databases.
The spatial database tests may not work without additional system
configuration. PostGIS tests require a test database to have been set up;
see geopandas.tests.util for more information.
"""
import os
import warnings
from importlib.util import find_spec
import pandas as pd
import geopandas
import geopandas._compat as compat
from geopandas import GeoDataFrame, read_file, read_postgis
from geopandas._compat import HAS_PYPROJ
from geopandas.io.sql import _get_conn as get_conn
from geopandas.io.sql import _write_postgis as write_postgis
import pytest
from geopandas.tests.util import (
create_postgis,
create_spatialite,
mock,
validate_boro_df,
)
try:
from sqlalchemy import text
except ImportError:
# Avoid local imports for text in all sqlalchemy tests
# all tests using text use engine_postgis, which ensures sqlalchemy is available
text = str
@pytest.fixture
def df_nybb(nybb_filename):
df = read_file(nybb_filename)
return df
def check_available_postgis_drivers() -> list[str]:
"""Work out which of psycopg2 and psycopg are available.
This prevents tests from running if the relevant package isn't installed
(rather than being skipped, as skips are treated as failures during postgis CI)
"""
drivers = []
if find_spec("psycopg"):
drivers.append("psycopg")
if find_spec("psycopg2"):
drivers.append("psycopg2")
return drivers
POSTGIS_DRIVERS = check_available_postgis_drivers()
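# Tests below feed this list to their connection/engine fixtures through
# indirect parametrization, e.g.:
#   @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)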
def prepare_database_credentials() -> dict:
"""Gather postgres connection credentials from environment variables."""
return {
"dbname": "test_geopandas",
"user": os.environ.get("PGUSER"),
"password": os.environ.get("PGPASSWORD"),
"host": os.environ.get("PGHOST"),
"port": os.environ.get("PGPORT"),
}
@pytest.fixture()
def connection_postgis(request):
"""Create a postgres connection using either psycopg2 or psycopg.
Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS."""
psycopg = pytest.importorskip(request.param)
try:
con = psycopg.connect(**prepare_database_credentials())
except psycopg.OperationalError:
pytest.skip("Cannot connect with postgresql database")
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", message="pandas only supports SQLAlchemy connectable.*"
)
yield con
con.close()
@pytest.fixture()
def engine_postgis(request):
"""
Initiate a sqlalchemy connection engine using either psycopg2 or psycopg.
Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS.
"""
sqlalchemy = pytest.importorskip("sqlalchemy")
from sqlalchemy.engine.url import URL
credentials = prepare_database_credentials()
try:
con = sqlalchemy.create_engine(
URL.create(
drivername=f"postgresql+{request.param}",
username=credentials["user"],
database=credentials["dbname"],
password=credentials["password"],
host=credentials["host"],
port=credentials["port"],
)
)
con.connect()
except Exception:
pytest.skip("Cannot connect with postgresql database")
yield con
con.dispose()
@pytest.fixture()
def connection_spatialite():
"""
Return a memory-based SQLite3 connection with SpatiaLite enabled & initialized.
`The sqlite3 module must be built with loadable extension support
<https://docs.python.org/3/library/sqlite3.html#f1>`_ and
`SpatiaLite <https://www.gaia-gis.it/fossil/libspatialite/index>`_
must be available on the system as a SQLite module.
Packages available on Anaconda meet requirements.
Exceptions
----------
``AttributeError`` on missing support for loadable SQLite extensions
``sqlite3.OperationalError`` on missing SpatiaLite
"""
sqlite3 = pytest.importorskip("sqlite3")
try:
with sqlite3.connect(":memory:") as con:
con.enable_load_extension(True)
con.load_extension("mod_spatialite")
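# InitSpatialMetaData creates SpatiaLite's metadata tables; TRUE wraps
# the initialization in a single transaction, which is much faster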
con.execute("SELECT InitSpatialMetaData(TRUE)")
except Exception:
con.close()
pytest.skip("Cannot setup spatialite database")
yield con
con.close()
def drop_table_if_exists(conn_or_engine, table):
sqlalchemy = pytest.importorskip("sqlalchemy")
if sqlalchemy.inspect(conn_or_engine).has_table(table):
metadata = sqlalchemy.MetaData()
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", message="Did not recognize type 'geometry' of column.*"
)
metadata.reflect(conn_or_engine)
table = metadata.tables.get(table)
if table is not None:
table.drop(conn_or_engine, checkfirst=True)
@pytest.fixture
def df_mixed_single_and_multi():
from shapely.geometry import LineString, MultiLineString, Point
df = geopandas.GeoDataFrame(
{
"geometry": [
LineString([(0, 0), (1, 1)]),
MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
Point(0, 1),
]
},
crs="epsg:4326",
)
return df
@pytest.fixture
def df_geom_collection():
from shapely.geometry import GeometryCollection, LineString, Point, Polygon
df = geopandas.GeoDataFrame(
{
"geometry": [
GeometryCollection(
[
Polygon([(0, 0), (1, 1), (0, 1)]),
LineString([(0, 0), (1, 1)]),
Point(0, 0),
]
)
]
},
crs="epsg:4326",
)
return df
@pytest.fixture
def df_linear_ring():
from shapely.geometry import LinearRing
df = geopandas.GeoDataFrame(
{"geometry": [LinearRing(((0, 0), (0, 1), (1, 1), (1, 0)))]}, crs="epsg:4326"
)
return df
@pytest.fixture
def df_3D_geoms():
from shapely.geometry import LineString, Point, Polygon
df = geopandas.GeoDataFrame(
{
"geometry": [
LineString([(0, 0, 0), (1, 1, 1)]),
Polygon([(0, 0, 0), (1, 1, 1), (0, 1, 1)]),
Point(0, 1, 2),
]
},
crs="epsg:4326",
)
return df
class TestIO:
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_get_conn(self, engine_postgis):
Connection = pytest.importorskip("sqlalchemy.engine.base").Connection
engine = engine_postgis
with get_conn(engine) as output:
assert isinstance(output, Connection)
with engine.connect() as conn:
with get_conn(conn) as output:
assert isinstance(output, Connection)
with pytest.raises(ValueError):
with get_conn(object()):
pass
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_default(self, connection_postgis, df_nybb):
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con)
validate_boro_df(df)
# no crs defined on the created geodatabase, and none specified
# by user; should not be set to 0, as from get_srid failure
assert df.crs is None
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_custom_geom_col(self, connection_postgis, df_nybb):
con = connection_postgis
geom_col = "the_geom"
create_postgis(con, df_nybb, geom_col=geom_col)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_select_geom_as(self, connection_postgis, df_nybb):
"""Tests that a SELECT {geom} AS {some_other_geom} works."""
con = connection_postgis
orig_geom = "geom"
out_geom = "the_geom"
create_postgis(con, df_nybb, geom_col=orig_geom)
sql = """SELECT borocode, boroname, shape_leng, shape_area,
{} as {} FROM nybb;""".format(
orig_geom, out_geom
)
df = read_postgis(sql, con, geom_col=out_geom)
validate_boro_df(df)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_get_srid(self, connection_postgis, df_nybb):
"""Tests that an SRID can be read from a geodatabase (GH #451)."""
con = connection_postgis
crs = "epsg:4269"
df_reproj = df_nybb.to_crs(crs)
create_postgis(con, df_reproj, srid=4269)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con)
validate_boro_df(df)
assert df.crs == crs
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_override_srid(self, connection_postgis, df_nybb):
"""Tests that a user specified CRS overrides the geodatabase SRID."""
con = connection_postgis
orig_crs = df_nybb.crs
create_postgis(con, df_nybb, srid=4269)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con, crs=orig_crs)
validate_boro_df(df)
assert df.crs == orig_crs
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_from_postgis_default(self, connection_postgis, df_nybb):
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
df = GeoDataFrame.from_postgis(sql, con)
validate_boro_df(df, case_sensitive=False)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_from_postgis_custom_geom_col(self, connection_postgis, df_nybb):
con = connection_postgis
geom_col = "the_geom"
create_postgis(con, df_nybb, geom_col=geom_col)
sql = "SELECT * FROM nybb;"
df = GeoDataFrame.from_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df, case_sensitive=False)
def test_read_postgis_null_geom(self, connection_spatialite, df_nybb):
"""Tests that geometry with NULL is accepted."""
con = connection_spatialite
geom_col = df_nybb.geometry.name
df_nybb.geometry.iat[0] = None
create_spatialite(con, df_nybb)
sql = (
"SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
'AsEWKB("{0}") AS "{0}" FROM nybb'.format(geom_col)
)
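# AsEWKB returns the geometry in extended WKB form; NULL geometries come
# back as SQL NULL, which read_postgis must accept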
df = read_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df)
def test_read_postgis_binary(self, connection_spatialite, df_nybb):
"""Tests that geometry read as binary is accepted."""
con = connection_spatialite
geom_col = df_nybb.geometry.name
create_spatialite(con, df_nybb)
sql = (
"SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
'ST_AsBinary("{0}") AS "{0}" FROM nybb'.format(geom_col)
)
df = read_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_chunksize(self, connection_postgis, df_nybb):
"""Test chunksize argument"""
chunksize = 2
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
validate_boro_df(df)
# no crs defined on the created geodatabase, and none specified
# by user; should not be set to 0, as from get_srid failure
assert df.crs is None
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_default(self, engine_postgis, df_nybb):
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
engine = engine_postgis
table = "nybb"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(engine, table)
# Write to db
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_uppercase_tablename(self, engine_postgis, df_nybb):
"""Tests writing GeoDataFrame to PostGIS with uppercase tablename."""
engine = engine_postgis
table = "aTestTable"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(engine, table)
# Write to db
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
# Validate
sql = text('SELECT * FROM "{table}";'.format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_sqlalchemy_connection(self, engine_postgis, df_nybb):
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
with engine_postgis.begin() as con:
table = "nybb_con"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(con, table)
# Write to db
write_postgis(df_nybb, con=con, name=table, if_exists="fail")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, con, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_fail_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that uploading the same table raises an error when if_exists='fail'.
"""
engine = engine_postgis
table = "nybb"
# Ensure table exists
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
try:
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
except ValueError as e:
if "already exists" in str(e):
pass
else:
raise e
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_replace_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that replacing a table is possible when if_exists='replace'.
"""
engine = engine_postgis
table = "nybb"
# Ensure table exists
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Overwrite
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_append_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that appending to an existing table produces correct results when
if_exists='append'.
"""
engine = engine_postgis
table = "nybb"
orig_rows, orig_cols = df_nybb.shape
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
write_postgis(df_nybb, con=engine, name=table, if_exists="append")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
new_rows, new_cols = df.shape
# There should be twice as many rows in the new table
assert new_rows == orig_rows * 2, (
"There should be {target} rows, found: {current}".format(
target=orig_rows * 2, current=new_rows
)
)
# Number of columns should stay the same
assert new_cols == orig_cols, (
"There should be {target} columns, found: {current}".format(
target=orig_cols, current=new_cols
)
)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_without_crs(self, engine_postgis, df_nybb):
"""
Tests that GeoDataFrame can be written to PostGIS without CRS information.
"""
engine = engine_postgis
table = "nybb"
# Write to db
df_nybb.geometry.array.crs = None
with pytest.warns(UserWarning, match="Could not parse CRS from the GeoDataF"):
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Validate that srid is 0
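# Find_SRID looks up the SRID registered for the column in PostGIS's
# geometry_columns catalog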
sql = text(
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
schema="public", table=table, geom_col="geometry"
)
)
with engine.connect() as conn:
target_srid = conn.execute(sql).fetchone()[0]
assert target_srid == 0, "SRID should be 0, found %s" % target_srid
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_with_esri_authority(self, engine_postgis, df_nybb):
"""
Tests that GeoDataFrame can be written to PostGIS with ESRI Authority
CRS information (GH #2414).
"""
engine = engine_postgis
table = "nybb"
# Write to db
df_nybb_esri = df_nybb.to_crs("ESRI:102003")
write_postgis(df_nybb_esri, con=engine, name=table, if_exists="replace")
# Validate that srid is 102003
sql = text(
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
schema="public", table=table, geom_col="geometry"
)
)
with engine.connect() as conn:
target_srid = conn.execute(sql).fetchone()[0]
assert target_srid == 102003, "SRID should be 102003, found %s" % target_srid
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_geometry_collection(
self, engine_postgis, df_geom_collection
):
"""
Tests that writing a mix of different geometry types is possible.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(df_geom_collection, con=engine, name=table, if_exists="replace")
# Validate geometry type
sql = text(
"SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
geom_type = conn.execute(sql).fetchone()[0]
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
assert geom_type.upper() == "GEOMETRYCOLLECTION"
assert df.geom_type.unique()[0] == "GeometryCollection"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_mixed_geometry_types(
self, engine_postgis, df_mixed_single_and_multi
):
"""
Tests that writing a mix of single and MultiGeometries is possible.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(
df_mixed_single_and_multi, con=engine, name=table, if_exists="replace"
)
# Validate geometry type
sql = text(
"SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
res = conn.execute(sql).fetchall()
assert res[0][0].upper() == "LINESTRING"
assert res[1][0].upper() == "MULTILINESTRING"
assert res[2][0].upper() == "POINT"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_linear_ring(self, engine_postgis, df_linear_ring):
"""
Tests that writing a LinearRing works.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(df_linear_ring, con=engine, name=table, if_exists="replace")
# Validate geometry type
sql = text(
"SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
geom_type = conn.execute(sql).fetchone()[0]
assert geom_type.upper() == "LINESTRING"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_in_chunks(self, engine_postgis, df_mixed_single_and_multi):
"""
Tests that writing a GeoDataFrame in chunks works.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(
df_mixed_single_and_multi,
con=engine,
name=table,
if_exists="replace",
chunksize=1,
)
# Validate row count
sql = text("SELECT COUNT(geometry) FROM {table};".format(table=table))
with engine.connect() as conn:
row_cnt = conn.execute(sql).fetchone()[0]
assert row_cnt == 3
# Validate geometry type
sql = text(
"SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
res = conn.execute(sql).fetchall()
assert res[0][0].upper() == "LINESTRING"
assert res[1][0].upper() == "MULTILINESTRING"
assert res[2][0].upper() == "POINT"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_to_different_schema(self, engine_postgis, df_nybb):
"""
Tests writing data to an alternative schema.
"""
engine = engine_postgis
table = "nybb"
schema_to_use = "test"
sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
with engine.begin() as conn:
conn.execute(sql)
write_postgis(
df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
)
# Validate
sql = text(
"SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
)
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_to_different_schema_when_table_exists(
self, engine_postgis, df_nybb
):
"""
Tests writing data to an alternative schema when the table already exists.
"""
engine = engine_postgis
table = "nybb"
schema_to_use = "test"
sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
with engine.begin() as conn:
conn.execute(sql)
try:
write_postgis(
df_nybb, con=engine, name=table, if_exists="fail", schema=schema_to_use
)
# Validate
sql = text(
"SELECT * FROM {schema}.{table};".format(
schema=schema_to_use, table=table
)
)
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
# Should raise a ValueError when table exists
except ValueError:
pass
# Try with replace flag on
write_postgis(
df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
)
# Validate
sql = text(
"SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
)
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_3D_geometries(self, engine_postgis, df_3D_geoms):
"""
Tests that writing geometries with 3 dimensions works.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(df_3D_geoms, con=engine, name=table, if_exists="replace")
# Check that all geometries have 3 dimensions
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
assert list(df.geometry.has_z) == [True, True, True]
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_row_order(self, engine_postgis, df_nybb):
"""
Tests that the row order in db table follows the order of the original frame.
"""
engine = engine_postgis
table = "row_order_test"
correct_order = df_nybb["BoroCode"].tolist()
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Check that the row order matches
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
assert df["BoroCode"].tolist() == correct_order
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_before_table_exists(self, engine_postgis, df_nybb):
"""
Tests that insert works with if_exists='append' when table does not exist yet.
"""
engine = engine_postgis
table = "nybb"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(engine, table)
write_postgis(df_nybb, con=engine, name=table, if_exists="append")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_with_different_crs(self, engine_postgis, df_nybb):
"""
Tests that an error is raised if the table CRS differs from the frame's.
"""
engine = engine_postgis
table = "nybb"
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Reproject
df_nybb2 = df_nybb.to_crs(epsg=4326)
# Should raise error when appending
with pytest.raises(ValueError, match="CRS of the target table"):
write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_without_crs(self, engine_postgis, df_nybb):
# This test was included in #3328 when the default value for no
# CRS was changed from an SRID of -1 to 0. This resolves issues with
# appending dataframes that have no CRS, as PostGIS uses 0 to mean
# "no CRS".
engine = engine_postgis
df_nybb = df_nybb.set_crs(None, allow_override=True)
table = "nybb"
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# append another dataframe with no crs
df_nybb2 = df_nybb
write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
@pytest.mark.xfail(
compat.PANDAS_GE_20 and not compat.PANDAS_GE_202,
reason="Duplicate columns are dropped in read_sql with pandas 2.0.0 and 2.0.1",
)
def test_duplicate_geometry_column_fails(self, engine_postgis):
"""
Tests that a ValueError is raised if an SQL query returns two geometry columns.
"""
engine = engine_postgis
sql = "select ST_MakePoint(0, 0) as geom, ST_MakePoint(0, 0) as geom;"
with pytest.raises(ValueError):
read_postgis(sql, engine, geom_col="geom")
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_non_epsg_crs(self, connection_postgis, df_nybb):
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="esri:54052")
create_postgis(con, df_nybb, srid=54052)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con)
validate_boro_df(df)
assert df.crs == "ESRI:54052"
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
@mock.patch("shapely.get_srid")
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_srid_not_in_table(self, mock_get_srid, connection_postgis, df_nybb):
# mock a non-existent srid for edge case if shapely has an srid
# not present in postgis table.
pyproj = pytest.importorskip("pyproj")
mock_get_srid.return_value = 99999
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="epsg:4326")
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
with pytest.raises(pyproj.exceptions.CRSError, match="crs not found"):
with pytest.warns(UserWarning, match="Could not find srid 99999"):
read_postgis(sql, con)
@mock.patch("geopandas.io.sql._get_spatial_ref_sys_df")
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_no_spatial_ref_sys_table_in_postgis(
self, mock_get_spatial_ref_sys_df, connection_postgis, df_nybb
):
# mock for a non-existent spatial_ref_sys database
mock_get_spatial_ref_sys_df.side_effect = pd.errors.DatabaseError
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="epsg:4326")
create_postgis(con, df_nybb, srid=4326)
sql = "SELECT * FROM nybb;"
with pytest.warns(
UserWarning, match="Could not find the spatial reference system table"
):
df = read_postgis(sql, con)
assert df.crs == "EPSG:4326"
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_non_epsg_crs_chunksize(self, connection_postgis, df_nybb):
"""Test chunksize argument with non epsg crs"""
chunksize = 2
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="esri:54052")
create_postgis(con, df_nybb, srid=54052)
sql = "SELECT * FROM nybb;"
df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
validate_boro_df(df)
assert df.crs == "ESRI:54052"

View File

@@ -1,118 +0,0 @@
"""Vendored, cut down version of pyogrio/util.py for use with fiona"""
import re
import sys
from urllib.parse import urlparse
def vsi_path(path: str) -> str:
"""
Ensure path is a local path or a GDAL-compatible vsi path.
"""
# path is already in GDAL format
if path.startswith("/vsi"):
return path
# Windows drive letters (e.g. "C:\") confuse `urlparse` as they look like
# URL schemes
if sys.platform == "win32" and re.match("^[a-zA-Z]\\:", path):
if not path.split("!")[0].endswith(".zip"):
return path
# prefix with zip:// so the remaining parsing below can handle the archive
path = f"zip://{path}"
path, archive, scheme = _parse_uri(path)
if scheme or archive or path.endswith(".zip"):
return _construct_vsi_path(path, archive, scheme)
return path
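# Illustrative conversions (derived from the scheme table below):
#   vsi_path("/data/file.shp")                 -> "/data/file.shp"
#   vsi_path("zip://archive.zip!layer.shp")    -> "/vsizip/archive.zip/layer.shp"
#   vsi_path("https://example.com/f.geojson")  -> "/vsicurl/https://example.com/f.geojson"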
# Supported URI schemes and their mapping to GDAL's VSI suffix.
SCHEMES = {
"file": "file",
"zip": "zip",
"tar": "tar",
"gzip": "gzip",
"http": "curl",
"https": "curl",
"ftp": "curl",
"s3": "s3",
"gs": "gs",
"az": "az",
"adls": "adls",
"adl": "adls", # fsspec uses this
"hdfs": "hdfs",
"webhdfs": "webhdfs",
# GDAL additionally supports oss and swift for remote filesystems, but
# those are for now not added as supported URI schemes
}
CURLSCHEMES = {k for k, v in SCHEMES.items() if v == "curl"}
def _parse_uri(path: str):
"""
Parse a URI
Returns a tuple of (path, archive, scheme)
path : str
Parsed path. Includes the hostname and query string in the case
of a URI.
archive : str
Parsed archive path.
scheme : str
URI scheme such as "https" or "zip+s3".
"""
parts = urlparse(path, allow_fragments=False)
# if the scheme is not one of GDAL's supported schemes, return raw path
if parts.scheme and not all(p in SCHEMES for p in parts.scheme.split("+")):
return path, "", ""
# we have a URI
path = parts.path
scheme = parts.scheme or ""
if parts.query:
path += "?" + parts.query
if parts.scheme and parts.netloc:
path = parts.netloc + path
parts = path.split("!")
path = parts.pop() if parts else ""
archive = parts.pop() if parts else ""
return (path, archive, scheme)
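# e.g. _parse_uri("zip+s3://bucket/archive.zip!file.shp")
#   -> ("file.shp", "bucket/archive.zip", "zip+s3")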
def _construct_vsi_path(path, archive, scheme) -> str:
"""Convert a parsed path to a GDAL VSI path"""
prefix = ""
suffix = ""
schemes = scheme.split("+")
if "zip" not in schemes and (archive.endswith(".zip") or path.endswith(".zip")):
schemes.insert(0, "zip")
if schemes:
prefix = "/".join(
"vsi{0}".format(SCHEMES[p]) for p in schemes if p and p != "file"
)
if schemes[-1] in CURLSCHEMES:
suffix = f"{schemes[-1]}://"
if prefix:
if archive:
return "/{}/{}{}/{}".format(prefix, suffix, archive, path.lstrip("/"))
else:
return "/{}/{}{}".format(prefix, suffix, path)
return path
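# e.g. _construct_vsi_path("f.shp", "example.com/archive.zip", "zip+https")
#   -> "/vsizip/vsicurl/https://example.com/archive.zip/f.shp"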