del env py

2024-10-11 17:10:34 -07:00
parent 55b630e6c8
commit b010ab0e6d
19334 changed files with 1 addition and 4003544 deletions

View File

@@ -1,29 +0,0 @@
from geopandas._config import options
from geopandas.geoseries import GeoSeries
from geopandas.geodataframe import GeoDataFrame
from geopandas.array import points_from_xy
from geopandas.io.file import _read_file as read_file
from geopandas.io.file import _list_layers as list_layers
from geopandas.io.arrow import _read_parquet as read_parquet
from geopandas.io.arrow import _read_feather as read_feather
from geopandas.io.sql import _read_postgis as read_postgis
from geopandas.tools import sjoin, sjoin_nearest
from geopandas.tools import overlay
from geopandas.tools._show_versions import show_versions
from geopandas.tools import clip
import geopandas.datasets
# make the interactive namespace easier to use
# for `from geopandas import *` demos.
import geopandas as gpd
import pandas as pd
import numpy as np
from . import _version
__version__ = _version.get_versions()["version"]
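# Usage sketch (editor's addition, not part of the original file): a minimal
# demo of the names re-exported above; guarded so it only runs when executed
# directly.
if __name__ == "__main__":
    _demo = GeoDataFrame(
        {"city": ["A", "B"]},
        geometry=points_from_xy([0.0, 1.0], [0.0, 1.0]),
        crs="EPSG:4326",
    )
    print(_demo)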

View File

@@ -1,92 +0,0 @@
import importlib
from packaging.version import Version
import pandas as pd
import shapely
import shapely.geos
# -----------------------------------------------------------------------------
# pandas compat
# -----------------------------------------------------------------------------
PANDAS_GE_14 = Version(pd.__version__) >= Version("1.4.0rc0")
PANDAS_GE_15 = Version(pd.__version__) >= Version("1.5.0")
PANDAS_GE_20 = Version(pd.__version__) >= Version("2.0.0")
PANDAS_GE_202 = Version(pd.__version__) >= Version("2.0.2")
PANDAS_GE_21 = Version(pd.__version__) >= Version("2.1.0")
PANDAS_GE_22 = Version(pd.__version__) >= Version("2.2.0")
PANDAS_GE_30 = Version(pd.__version__) >= Version("3.0.0.dev0")
# -----------------------------------------------------------------------------
# Shapely / GEOS compat
# -----------------------------------------------------------------------------
SHAPELY_GE_204 = Version(shapely.__version__) >= Version("2.0.4")
GEOS_GE_390 = shapely.geos.geos_version >= (3, 9, 0)
GEOS_GE_310 = shapely.geos.geos_version >= (3, 10, 0)
def import_optional_dependency(name: str, extra: str = ""):
"""
Import an optional dependency.
Adapted from pandas.compat._optional::import_optional_dependency
Raises a formatted ImportError if the module is not present.
Parameters
----------
name : str
The module name.
extra : str
Additional text to include in the ImportError message.
Returns
-------
module
"""
msg = """Missing optional dependency '{name}'. {extra} "
"Use pip or conda to install {name}.""".format(
name=name, extra=extra
)
if not isinstance(name, str):
raise ValueError(
"Invalid module name: '{name}'; must be a string".format(name=name)
)
try:
module = importlib.import_module(name)
except ImportError:
raise ImportError(msg) from None
return module
# -----------------------------------------------------------------------------
# pyproj compat
# -----------------------------------------------------------------------------
try:
import pyproj # noqa: F401
HAS_PYPROJ = True
except ImportError as err:
HAS_PYPROJ = False
pyproj_import_error = str(err)
def requires_pyproj(func):
def wrapper(*args, **kwargs):
if not HAS_PYPROJ:
raise ImportError(
f"The 'pyproj' package is required for {func.__name__} to work. "
"Install it and initialize the object with a CRS before using it."
f"\nImporting pyproj resulted in: {pyproj_import_error}"
)
return func(*args, **kwargs)
return wrapper
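# Usage sketch (editor's addition): how the helpers above are meant to be
# used; 'matplotlib' is just an illustrative optional dependency.
if __name__ == "__main__":
    try:
        mpl = import_optional_dependency(
            "matplotlib", extra="matplotlib is required for plotting."
        )
        print(mpl.__name__)
    except ImportError as exc:
        print(exc)

    @requires_pyproj
    def _needs_crs():
        # only reachable when pyproj imported successfully
        return pyproj.CRS.from_user_input("EPSG:4326")

    if HAS_PYPROJ:
        print(_needs_crs())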

View File

@@ -1,133 +0,0 @@
"""
Lightweight options machinery.
Based on https://github.com/topper-123/optioneer, but simplified (no nested
options, no deprecated options, ...): just the attribute-style, dict-like
holder of the options that also gives a nice repr.
"""
import textwrap
import warnings
from collections import namedtuple
Option = namedtuple("Option", "key default_value doc validator callback")
class Options(object):
"""Provide attribute-style access to configuration dict."""
def __init__(self, options):
super().__setattr__("_options", options)
# populate with default values
config = {}
for key, option in options.items():
config[key] = option.default_value
super().__setattr__("_config", config)
def __setattr__(self, key, value):
# you can't set new keys
if key in self._config:
option = self._options[key]
if option.validator:
option.validator(value)
self._config[key] = value
if option.callback:
option.callback(key, value)
else:
msg = "You can only set the value of existing options"
raise AttributeError(msg)
def __getattr__(self, key):
try:
return self._config[key]
except KeyError:
raise AttributeError("No such option")
def __dir__(self):
return list(self._config.keys())
def __repr__(self):
cls = self.__class__.__name__
description = ""
for key, option in self._options.items():
descr = "{key}: {cur!r} [default: {default!r}]\n".format(
key=key, cur=self._config[key], default=option.default_value
)
description += descr
if option.doc:
doc_text = "\n".join(textwrap.wrap(option.doc, width=70))
else:
doc_text = "No description available."
doc_text = textwrap.indent(doc_text, prefix=" ")
description += doc_text + "\n"
space = "\n "
description = description.replace("\n", space)
return "{}({}{})".format(cls, space, description)
def _validate_display_precision(value):
if value is not None:
if not isinstance(value, int) or not (0 <= value <= 16):
raise ValueError("Invalid value, needs to be an integer [0-16]")
display_precision = Option(
key="display_precision",
default_value=None,
doc=(
"The precision (maximum number of decimals) of the coordinates in "
"the WKT representation in the Series/DataFrame display. "
"By default (None), it tries to infer and use 3 decimals for projected "
"coordinates and 5 decimals for geographic coordinates."
),
validator=_validate_display_precision,
callback=None,
)
def _warn_use_pygeos_deprecated(_value):
warnings.warn(
"pygeos support was removed in 1.0. "
"geopandas.use_pygeos is a no-op and will be removed in geopandas 1.1.",
stacklevel=3,
)
def _validate_io_engine(value):
if value is not None:
if value not in ("pyogrio", "fiona"):
raise ValueError(f"Expected 'pyogrio' or 'fiona', got '{value}'")
io_engine = Option(
key="io_engine",
default_value=None,
doc=(
"The default engine for ``read_file`` and ``to_file``. "
"Options are 'pyogrio' and 'fiona'."
),
validator=_validate_io_engine,
callback=None,
)
# TODO: deprecate this
use_pygeos = Option(
key="use_pygeos",
default_value=False,
doc=(
"Deprecated option previously used to enable PyGEOS. "
"It will be removed in GeoPandas 1.1."
),
validator=_warn_use_pygeos_deprecated,
callback=None,
)
options = Options(
{
"display_precision": display_precision,
"use_pygeos": use_pygeos,
"io_engine": io_engine,
}
)
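# Usage sketch (editor's addition): setting and validating options via the
# attribute-style access implemented above.
if __name__ == "__main__":
    options.display_precision = 2  # accepted: integer in [0, 16]
    try:
        options.display_precision = "high"  # rejected by the validator
    except ValueError as exc:
        print(exc)
    try:
        options.new_option = True  # unknown keys cannot be created
    except AttributeError as exc:
        print(exc)
    print(options)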

View File

@@ -1,52 +0,0 @@
from textwrap import dedent
from typing import Callable, Union
# doc decorator function ported with modifications from Pandas
# https://github.com/pandas-dev/pandas/blob/master/pandas/util/_decorators.py
def doc(*docstrings: Union[str, Callable], **params) -> Callable:
"""
A decorator that takes docstring templates, concatenates them, and performs
string substitution on them.
This decorator adds a variable "_docstring_components" to the wrapped
callable to keep track of the original docstring templates for potential
future use. If a component should be treated as a template, it is saved as
a string. Otherwise, it is saved as a callable whose __doc__ is later
dedented to obtain the docstring.
Parameters
----------
*docstrings : str or callable
The string / docstring / docstring template to be appended in order
after default docstring under callable.
**params
The string which would be used to format docstring template.
"""
def decorator(decorated: Callable) -> Callable:
# collecting docstring and docstring templates
docstring_components: list[Union[str, Callable]] = []
if decorated.__doc__:
docstring_components.append(dedent(decorated.__doc__))
for docstring in docstrings:
if hasattr(docstring, "_docstring_components"):
docstring_components.extend(docstring._docstring_components)
elif isinstance(docstring, str) or docstring.__doc__:
docstring_components.append(docstring)
# formatting templates and concatenating docstring
decorated.__doc__ = "".join(
(
component.format(**params)
if isinstance(component, str)
else dedent(component.__doc__ or "")
)
for component in docstring_components
)
decorated._docstring_components = docstring_components
return decorated
return decorator
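# Usage sketch (editor's addition): sharing one docstring template between
# two functions via the ``doc`` decorator defined above.
if __name__ == "__main__":

    @doc(klass="GeoSeries")
    def _first():
        """Return the first row of a {klass}."""

    @doc(_first, klass="GeoDataFrame")
    def _second():
        pass

    print(_first.__doc__)   # Return the first row of a GeoSeries.
    print(_second.__doc__)  # Return the first row of a GeoDataFrame.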

View File

@@ -1,21 +0,0 @@
# This file was generated by 'versioneer.py' (0.29) from
# revision-control system data, or from the parent directory name of an
# unpacked source archive. Distribution tarballs contain a pre-generated copy
# of this file.
import json
version_json = '''
{
"date": "2024-07-02T14:23:16+0200",
"dirty": false,
"error": null,
"full-revisionid": "747d66ee6fcf00b819c08f11ecded53736c4652b",
"version": "1.0.1"
}
''' # END VERSION_JSON
def get_versions():
return json.loads(version_json)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,47 +0,0 @@
import os.path
import geopandas
import pytest
from geopandas.tests.util import _NATURALEARTH_CITIES, _NATURALEARTH_LOWRES, _NYBB
@pytest.fixture(autouse=True)
def add_geopandas(doctest_namespace):
doctest_namespace["geopandas"] = geopandas
# Datasets used in our tests
@pytest.fixture(scope="session")
def naturalearth_lowres() -> str:
# skip if data missing, unless on github actions
if os.path.isfile(_NATURALEARTH_LOWRES) or os.getenv("GITHUB_ACTIONS"):
return _NATURALEARTH_LOWRES
else:
pytest.skip("Naturalearth lowres dataset not found")
@pytest.fixture(scope="session")
def naturalearth_cities() -> str:
# skip if data missing, unless on github actions
if os.path.isfile(_NATURALEARTH_CITIES) or os.getenv("GITHUB_ACTIONS"):
return _NATURALEARTH_CITIES
else:
pytest.skip("Naturalearth cities dataset not found")
@pytest.fixture(scope="session")
def nybb_filename() -> str:
# skip if data missing, unless on github actions
if os.path.isfile(_NYBB[len("zip://") :]) or os.getenv("GITHUB_ACTIONS"):
return _NYBB
else:
pytest.skip("NYBB dataset not found")
@pytest.fixture(scope="class")
def _setup_class_nybb_filename(nybb_filename, request):
"""Attach nybb_filename class attribute for unittest style setup_method"""
request.cls.nybb_filename = nybb_filename
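# Usage sketch (editor's addition): a hypothetical test consuming the
# session-scoped fixtures above (pytest injects fixtures by argument name).
def _example_test_read_nybb(nybb_filename):
    gdf = geopandas.read_file(nybb_filename)
    assert not gdf.empty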

View File

@@ -1,25 +0,0 @@
__all__ = []
available = [] # previously part of __all__
_prev_available = ["naturalearth_cities", "naturalearth_lowres", "nybb"]
def get_path(dataset):
ne_message = "https://www.naturalearthdata.com/downloads/110m-cultural-vectors/."
nybb_message = (
"the geodatasets package.\n\nfrom geodatasets import get_path\n"
"path_to_file = get_path('nybb')\n"
)
error_msg = (
"The geopandas.dataset has been deprecated and was removed in GeoPandas "
f"1.0. You can get the original '{dataset}' data from "
f"{ne_message if 'natural' in dataset else nybb_message}"
)
if dataset in _prev_available:
raise AttributeError(error_msg)
else:
error_msg = (
"The geopandas.dataset has been deprecated and "
"was removed in GeoPandas 1.0. New sample datasets are now available "
"in the geodatasets package (https://geodatasets.readthedocs.io/en/latest/)"
)
raise AttributeError(error_msg)
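# Migration sketch (editor's addition): the replacement suggested by the
# error messages above, assuming the ``geodatasets`` package is installed.
if __name__ == "__main__":
    from geodatasets import get_path as _geodatasets_get_path

    path_to_file = _geodatasets_get_path("nybb")
    print(path_to_file)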

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,614 +0,0 @@
import json
from packaging.version import Version
from typing import Dict, Optional, Tuple
import numpy as np
import pandas as pd
import pyarrow as pa
from numpy.typing import NDArray
import shapely
from shapely import GeometryType
from geopandas import GeoDataFrame
from geopandas._compat import SHAPELY_GE_204
from geopandas.array import from_shapely, from_wkb
GEOARROW_ENCODINGS = [
"point",
"linestring",
"polygon",
"multipoint",
"multilinestring",
"multipolygon",
]
## GeoPandas -> GeoArrow
class ArrowTable:
"""
Wrapper class for Arrow data.
This class implements the `Arrow PyCapsule Protocol`_ (i.e. having an
``__arrow_c_stream__`` method). This object can then be consumed by
your Arrow implementation of choice that supports this protocol.
.. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
Example
-------
>>> import pyarrow as pa
>>> pa.table(gdf.to_arrow()) # doctest: +SKIP
"""
def __init__(self, pa_table):
self._pa_table = pa_table
def __arrow_c_stream__(self, requested_schema=None):
return self._pa_table.__arrow_c_stream__(requested_schema=requested_schema)
class GeoArrowArray:
"""
Wrapper class for a geometry array as Arrow data.
This class implements the `Arrow PyCapsule Protocol`_ (i.e. having an
``__arrow_c_array/stream__`` method). This object can then be consumed by
your Arrow implementation of choice that supports this protocol.
.. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
Example
-------
>>> import pyarrow as pa
>>> pa.array(ser.to_arrow()) # doctest: +SKIP
"""
def __init__(self, pa_field, pa_array):
self._pa_array = pa_array
self._pa_field = pa_field
def __arrow_c_array__(self, requested_schema=None):
if requested_schema is not None:
raise NotImplementedError(
"Requested schema is not supported for geometry arrays"
)
return (
self._pa_field.__arrow_c_schema__(),
self._pa_array.__arrow_c_array__()[1],
)
def geopandas_to_arrow(
df,
index=None,
geometry_encoding="WKB",
interleaved=True,
include_z=None,
):
"""
Convert GeoDataFrame to a pyarrow.Table.
Parameters
----------
df : GeoDataFrame
The GeoDataFrame to convert.
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output except `RangeIndex` which is stored as metadata only.
geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
The GeoArrow encoding to use for the data conversion.
interleaved : bool, default True
Only relevant for 'geoarrow' encoding. If True, the geometries'
coordinates are interleaved in a single fixed size list array.
If False, the coordinates are stored as separate arrays in a
struct type.
include_z : bool, default None
Only relevant for 'geoarrow' encoding (for WKB, the dimensionality
of the individual geometries is preserved).
If False, return 2D geometries. If True, include the third dimension
in the output (if a geometry has no third dimension, the z-coordinates
will be NaN). By default, will infer the dimensionality from the
input geometries. Note that this inference can be unreliable with
empty geometries (for a guaranteed result, it is recommended to
specify the keyword).
"""
mask = df.dtypes == "geometry"
geometry_columns = df.columns[mask]
geometry_indices = np.asarray(mask).nonzero()[0]
df_attr = pd.DataFrame(df.copy(deep=False))
# replace geometry columns with dummy values -> will get converted to
# Arrow null column (not holding any memory), so we can afterwards
# fill the resulting table with the correct geometry fields
for col in geometry_columns:
df_attr[col] = None
table = pa.Table.from_pandas(df_attr, preserve_index=index)
geometry_encoding_dict = {}
if geometry_encoding.lower() == "geoarrow":
if Version(pa.__version__) < Version("10.0.0"):
raise ValueError("Converting to 'geoarrow' requires pyarrow >= 10.0.")
# Encode all geometry columns to GeoArrow
for i, col in zip(geometry_indices, geometry_columns):
field, geom_arr = construct_geometry_array(
np.array(df[col].array),
include_z=include_z,
field_name=col,
crs=df[col].crs,
interleaved=interleaved,
)
table = table.set_column(i, field, geom_arr)
geometry_encoding_dict[col] = (
field.metadata[b"ARROW:extension:name"]
.decode()
.removeprefix("geoarrow.")
)
elif geometry_encoding.lower() == "wkb":
# Encode all geometry columns to WKB
for i, col in zip(geometry_indices, geometry_columns):
field, wkb_arr = construct_wkb_array(
np.asarray(df[col].array), field_name=col, crs=df[col].crs
)
table = table.set_column(i, field, wkb_arr)
geometry_encoding_dict[col] = "WKB"
else:
raise ValueError(
f"Expected geometry encoding 'WKB' or 'geoarrow' got {geometry_encoding}"
)
return table, geometry_encoding_dict
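# Usage sketch (editor's addition, not part of the original module): convert
# a tiny GeoDataFrame with the default WKB encoding.
def _example_geopandas_to_arrow():
    from shapely.geometry import Point

    gdf = GeoDataFrame({"name": ["a"]}, geometry=[Point(0.0, 0.0)], crs="EPSG:4326")
    table, encodings = geopandas_to_arrow(gdf, geometry_encoding="WKB")
    assert encodings == {"geometry": "WKB"}
    return table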
def construct_wkb_array(
shapely_arr: NDArray[np.object_],
*,
field_name: str = "geometry",
crs: Optional[str] = None,
) -> Tuple[pa.Field, pa.Array]:
# ISO WKB (which preserves 3D geometries) is supported from GEOS 3.10 on
if shapely.geos_version >= (3, 10, 0):
kwargs = {"flavor": "iso"}
else:
if shapely.has_z(shapely_arr).any():
raise ValueError("Cannot write 3D geometries with GEOS<3.10")
kwargs = {}
wkb_arr = shapely.to_wkb(shapely_arr, **kwargs)
extension_metadata = {"ARROW:extension:name": "geoarrow.wkb"}
if crs is not None:
extension_metadata["ARROW:extension:metadata"] = json.dumps(
{"crs": crs.to_json()}
)
else:
# In theory this should not be needed, but otherwise pyarrow < 17
# crashes on receiving such data through C Data Interface
# https://github.com/apache/arrow/issues/41741
extension_metadata["ARROW:extension:metadata"] = "{}"
field = pa.field(
field_name, type=pa.binary(), nullable=True, metadata=extension_metadata
)
parr = pa.array(np.asarray(wkb_arr), pa.binary())
return field, parr
def _convert_inner_coords(coords, interleaved, dims, mask=None):
if interleaved:
coords_field = pa.field(dims, pa.float64(), nullable=False)
typ = pa.list_(coords_field, len(dims))
if mask is None:
# mask keyword only added in pyarrow 15.0.0
parr = pa.FixedSizeListArray.from_arrays(coords.ravel(), type=typ)
else:
parr = pa.FixedSizeListArray.from_arrays(
coords.ravel(), type=typ, mask=mask
)
else:
if dims == "xy":
fields = [
pa.field("x", pa.float64(), nullable=False),
pa.field("y", pa.float64(), nullable=False),
]
parr = pa.StructArray.from_arrays(
[coords[:, 0].copy(), coords[:, 1].copy()], fields=fields, mask=mask
)
else:
fields = [
pa.field("x", pa.float64(), nullable=False),
pa.field("y", pa.float64(), nullable=False),
pa.field("z", pa.float64(), nullable=False),
]
parr = pa.StructArray.from_arrays(
[coords[:, 0].copy(), coords[:, 1].copy(), coords[:, 2].copy()],
fields=fields,
mask=mask,
)
return parr
def _linestring_type(point_type):
return pa.list_(pa.field("vertices", point_type, nullable=False))
def _polygon_type(point_type):
return pa.list_(
pa.field(
"rings",
pa.list_(pa.field("vertices", point_type, nullable=False)),
nullable=False,
)
)
def _multipoint_type(point_type):
return pa.list_(pa.field("points", point_type, nullable=False))
def _multilinestring_type(point_type):
return pa.list_(
pa.field("linestrings", _linestring_type(point_type), nullable=False)
)
def _multipolygon_type(point_type):
return pa.list_(pa.field("polygons", _polygon_type(point_type), nullable=False))
def construct_geometry_array(
shapely_arr: NDArray[np.object_],
include_z: Optional[bool] = None,
*,
field_name: str = "geometry",
crs: Optional[str] = None,
interleaved: bool = True,
) -> Tuple[pa.Field, pa.Array]:
# NOTE: this implementation returns a (field, array) pair so that it can set the
# extension metadata on the field without instantiating extension types into the
# global pyarrow registry
geom_type, coords, offsets = shapely.to_ragged_array(
shapely_arr, include_z=include_z
)
mask = shapely.is_missing(shapely_arr)
if mask.any():
if (
geom_type == GeometryType.POINT
and interleaved
and Version(pa.__version__) < Version("15.0.0")
):
raise ValueError(
"Converting point geometries with missing values is not supported "
"for interleaved coordinates with pyarrow < 15.0.0. Please "
"upgrade to a newer version of pyarrow."
)
mask = pa.array(mask, type=pa.bool_())
if geom_type == GeometryType.POINT and not SHAPELY_GE_204:
# bug in shapely < 2.0.4, see https://github.com/shapely/shapely/pull/2034
# this workaround only works if there are no empty points
indices = np.nonzero(mask)[0]
indices = indices - np.arange(len(indices))
coords = np.insert(coords, indices, np.nan, axis=0)
else:
mask = None
if coords.shape[-1] == 2:
dims = "xy"
elif coords.shape[-1] == 3:
dims = "xyz"
else:
raise ValueError(f"Unexpected coords dimensions: {coords.shape}")
extension_metadata: Dict[str, str] = {}
if crs is not None:
extension_metadata["ARROW:extension:metadata"] = json.dumps(
{"crs": crs.to_json()}
)
else:
# In theory this should not be needed, but otherwise pyarrow < 17
# crashes on receiving such data through C Data Interface
# https://github.com/apache/arrow/issues/41741
extension_metadata["ARROW:extension:metadata"] = "{}"
if geom_type == GeometryType.POINT:
parr = _convert_inner_coords(coords, interleaved, dims, mask=mask)
extension_metadata["ARROW:extension:name"] = "geoarrow.point"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.LINESTRING:
assert len(offsets) == 1, "Expected one offsets array"
(geom_offsets,) = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
parr = pa.ListArray.from_arrays(
pa.array(geom_offsets), _parr, _linestring_type(_parr.type), mask=mask
)
extension_metadata["ARROW:extension:name"] = "geoarrow.linestring"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.POLYGON:
assert len(offsets) == 2, "Expected two offsets arrays"
ring_offsets, geom_offsets = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
parr = parr.cast(_polygon_type(_parr.type))
extension_metadata["ARROW:extension:name"] = "geoarrow.polygon"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.MULTIPOINT:
assert len(offsets) == 1, "Expected one offsets array"
(geom_offsets,) = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
parr = pa.ListArray.from_arrays(
pa.array(geom_offsets), _parr, type=_multipoint_type(_parr.type), mask=mask
)
extension_metadata["ARROW:extension:name"] = "geoarrow.multipoint"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.MULTILINESTRING:
assert len(offsets) == 2, "Expected two offsets arrays"
ring_offsets, geom_offsets = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
parr = parr.cast(_multilinestring_type(_parr.type))
extension_metadata["ARROW:extension:name"] = "geoarrow.multilinestring"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.MULTIPOLYGON:
assert len(offsets) == 3, "Expected three offsets arrays"
ring_offsets, polygon_offsets, geom_offsets = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
_parr2 = pa.ListArray.from_arrays(pa.array(polygon_offsets), _parr1)
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr2, mask=mask)
parr = parr.cast(_multipolygon_type(_parr.type))
extension_metadata["ARROW:extension:name"] = "geoarrow.multipolygon"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
else:
raise ValueError(f"Unsupported type for geoarrow: {geom_type}")
## GeoArrow -> GeoPandas
def _get_arrow_geometry_field(field):
if (meta := field.metadata) is not None:
if (ext_name := meta.get(b"ARROW:extension:name", None)) is not None:
if ext_name.startswith(b"geoarrow."):
if (
ext_meta := meta.get(b"ARROW:extension:metadata", None)
) is not None:
ext_meta = json.loads(ext_meta.decode())
return ext_name.decode(), ext_meta
if isinstance(field.type, pa.ExtensionType):
ext_name = field.type.extension_name
if ext_name.startswith("geoarrow."):
ext_meta_ser = field.type.__arrow_ext_serialize__()
if ext_meta_ser:
ext_meta = json.loads(ext_meta_ser.decode())
else:
ext_meta = None
return ext_name, ext_meta
return None
def arrow_to_geopandas(table, geometry=None):
"""
Convert Arrow table object to a GeoDataFrame based on GeoArrow extension types.
Parameters
----------
table : pyarrow.Table
The Arrow table to convert.
geometry : str, default None
The name of the geometry column to set as the active geometry
column. If None, the first geometry column found will be used.
Returns
-------
GeoDataFrame
"""
if not isinstance(table, pa.Table):
table = pa.table(table)
geom_fields = []
for i, field in enumerate(table.schema):
geom = _get_arrow_geometry_field(field)
if geom is not None:
geom_fields.append((i, field.name, *geom))
if len(geom_fields) == 0:
raise ValueError("No geometry column found in the Arrow table.")
table_attr = table.drop([f[1] for f in geom_fields])
df = table_attr.to_pandas()
for i, col, ext_name, ext_meta in geom_fields:
crs = None
if ext_meta is not None and "crs" in ext_meta:
crs = ext_meta["crs"]
if ext_name == "geoarrow.wkb":
geom_arr = from_wkb(np.array(table[col]), crs=crs)
elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
geom_arr = from_shapely(
construct_shapely_array(table[col].combine_chunks(), ext_name), crs=crs
)
else:
raise TypeError(f"Unknown GeoArrow extension type: {ext_name}")
df.insert(i, col, geom_arr)
return GeoDataFrame(df, geometry=geometry or geom_fields[0][1])
def arrow_to_geometry_array(arr):
"""
Convert Arrow array object (representing single GeoArrow array) to a
geopandas GeometryArray.
Specifically for GeoSeries.from_arrow.
"""
if Version(pa.__version__) < Version("14.0.0"):
raise ValueError("Importing from Arrow requires pyarrow >= 14.0.")
schema_capsule, array_capsule = arr.__arrow_c_array__()
field = pa.Field._import_from_c_capsule(schema_capsule)
pa_arr = pa.Array._import_from_c_capsule(field.__arrow_c_schema__(), array_capsule)
geom_info = _get_arrow_geometry_field(field)
if geom_info is None:
raise ValueError("No GeoArrow geometry field found.")
ext_name, ext_meta = geom_info
crs = None
if ext_meta is not None and "crs" in ext_meta:
crs = ext_meta["crs"]
if ext_name == "geoarrow.wkb":
geom_arr = from_wkb(np.array(pa_arr), crs=crs)
elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
geom_arr = from_shapely(construct_shapely_array(pa_arr, ext_name), crs=crs)
else:
raise ValueError(f"Unknown GeoArrow extension type: {ext_name}")
return geom_arr
def _get_inner_coords(arr):
if pa.types.is_struct(arr.type):
if arr.type.num_fields == 2:
coords = np.column_stack(
[np.asarray(arr.field("x")), np.asarray(arr.field("y"))]
)
else:
coords = np.column_stack(
[
np.asarray(arr.field("x")),
np.asarray(arr.field("y")),
np.asarray(arr.field("z")),
]
)
return coords
else:
# fixed size list
return np.asarray(arr.values).reshape(len(arr), -1)
def construct_shapely_array(arr: pa.Array, extension_name: str):
"""
Construct a NumPy array of shapely geometries from a pyarrow.Array
with GeoArrow extension type.
"""
if isinstance(arr, pa.ExtensionArray):
arr = arr.storage
if extension_name == "geoarrow.point":
coords = _get_inner_coords(arr)
result = shapely.from_ragged_array(GeometryType.POINT, coords, None)
elif extension_name == "geoarrow.linestring":
coords = _get_inner_coords(arr.values)
offsets1 = np.asarray(arr.offsets)
offsets = (offsets1,)
result = shapely.from_ragged_array(GeometryType.LINESTRING, coords, offsets)
elif extension_name == "geoarrow.polygon":
coords = _get_inner_coords(arr.values.values)
offsets2 = np.asarray(arr.offsets)
offsets1 = np.asarray(arr.values.offsets)
offsets = (offsets1, offsets2)
result = shapely.from_ragged_array(GeometryType.POLYGON, coords, offsets)
elif extension_name == "geoarrow.multipoint":
coords = _get_inner_coords(arr.values)
offsets1 = np.asarray(arr.offsets)
offsets = (offsets1,)
result = shapely.from_ragged_array(GeometryType.MULTIPOINT, coords, offsets)
elif extension_name == "geoarrow.multilinestring":
coords = _get_inner_coords(arr.values.values)
offsets2 = np.asarray(arr.offsets)
offsets1 = np.asarray(arr.values.offsets)
offsets = (offsets1, offsets2)
result = shapely.from_ragged_array(
GeometryType.MULTILINESTRING, coords, offsets
)
elif extension_name == "geoarrow.multipolygon":
coords = _get_inner_coords(arr.values.values.values)
offsets3 = np.asarray(arr.offsets)
offsets2 = np.asarray(arr.values.offsets)
offsets1 = np.asarray(arr.values.values.offsets)
offsets = (offsets1, offsets2, offsets3)
result = shapely.from_ragged_array(GeometryType.MULTIPOLYGON, coords, offsets)
else:
raise ValueError(extension_name)
# apply validity mask
if arr.null_count:
mask = np.asarray(arr.is_null())
result = np.where(mask, None, result)
return result
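# Round-trip sketch (editor's addition): GeoPandas -> Arrow -> GeoPandas via
# the helpers above; the 'geoarrow' encoding path assumes pyarrow >= 10.
if __name__ == "__main__":
    from shapely.geometry import Point

    gdf = GeoDataFrame({"v": [1, 2]}, geometry=[Point(0, 0), Point(1, 1)])
    table, _ = geopandas_to_arrow(gdf, geometry_encoding="geoarrow")
    roundtripped = arrow_to_geopandas(table)
    assert list(roundtripped.columns) == ["v", "geometry"]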

View File

@@ -1,72 +0,0 @@
from packaging.version import Version
import pyarrow
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1 and calling `pa.PyExtensionType.set_auto_load(True)`
- installing pyarrow-hotfix (`pip install pyarrow-hotfix`) and disabling it by running
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and registering this type explicitly.
See https://arrow.apache.org/docs/dev/python/extending_types.html#defining-extension-types-user-defined-types
for more details.
"""
def patch_pyarrow():
# starting from pyarrow 14.0.1, it has its own mechanism
if Version(pyarrow.__version__) >= Version("14.0.1"):
return
# if the user has pyarrow_hotfix (https://github.com/pitrou/pyarrow-hotfix)
# installed, use this instead (which also ensures it works if they had
# called `pyarrow_hotfix.uninstall()`)
try:
import pyarrow_hotfix # noqa: F401
except ImportError:
pass
else:
return
# if the hotfix is already installed and enabled
if getattr(pyarrow, "_hotfix_installed", False):
return
class ForbiddenExtensionType(pyarrow.ExtensionType):
def __arrow_ext_serialize__(self):
return b""
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
import io
import pickletools
out = io.StringIO()
pickletools.dis(serialized, out)
raise RuntimeError(
_ERROR_MSG.format(
storage_type=storage_type,
serialized=serialized,
pickle_disassembly=out.getvalue(),
)
)
pyarrow.unregister_extension_type("arrow.py_extension_type")
pyarrow.register_extension_type(
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
)
pyarrow._hotfix_installed = True
patch_pyarrow()
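# Note (editor's addition): importing this module is enough to apply the
# patch; geopandas does so in read_parquet/read_feather via
# ``import geopandas.io._pyarrow_hotfix``.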

View File

@@ -1,913 +0,0 @@
import json
import warnings
from packaging.version import Version
import numpy as np
from pandas import DataFrame, Series
import shapely
import geopandas
from geopandas import GeoDataFrame
from geopandas._compat import import_optional_dependency
from geopandas.array import from_shapely, from_wkb
from .file import _expand_user
METADATA_VERSION = "1.0.0"
SUPPORTED_VERSIONS = ["0.1.0", "0.4.0", "1.0.0-beta.1", "1.0.0", "1.1.0"]
GEOARROW_ENCODINGS = [
"point",
"linestring",
"polygon",
"multipoint",
"multilinestring",
"multipolygon",
]
SUPPORTED_ENCODINGS = ["WKB"] + GEOARROW_ENCODINGS
# reference: https://github.com/opengeospatial/geoparquet
# Metadata structure:
# {
# "geo": {
# "columns": {
# "<name>": {
# "encoding": "WKB"
# "geometry_types": <list of str: REQUIRED>
# "crs": "<PROJJSON or None: OPTIONAL>",
# "orientation": "<'counterclockwise' or None: OPTIONAL>"
# "edges": "planar"
# "bbox": <list of [xmin, ymin, xmax, ymax]: OPTIONAL>
# "epoch": <float: OPTIONAL>
# }
# },
# "primary_column": "<str: REQUIRED>",
# "version": "<METADATA_VERSION>",
#
# # Additional GeoPandas specific metadata (not in metadata spec)
# "creator": {
# "library": "geopandas",
# "version": "<geopandas.__version__>"
# }
# }
# }
def _is_fsspec_url(url):
return (
isinstance(url, str)
and "://" in url
and not url.startswith(("http://", "https://"))
)
def _remove_id_from_member_of_ensembles(json_dict):
"""
Older PROJ versions will not recognize IDs of datum ensemble members that
were added in more recent PROJ database versions.
Cf https://github.com/opengeospatial/geoparquet/discussions/110
and https://github.com/OSGeo/PROJ/pull/3221
Mimicking the patch to GDAL from https://github.com/OSGeo/gdal/pull/5872
"""
for key, value in json_dict.items():
if isinstance(value, dict):
_remove_id_from_member_of_ensembles(value)
elif key == "members" and isinstance(value, list):
for member in value:
member.pop("id", None)
# type ids 0 to 7
_geometry_type_names = [
"Point",
"LineString",
"LineString",
"Polygon",
"MultiPoint",
"MultiLineString",
"MultiPolygon",
"GeometryCollection",
]
_geometry_type_names += [geom_type + " Z" for geom_type in _geometry_type_names]
def _get_geometry_types(series):
"""
Get unique geometry types from a GeoSeries.
"""
arr_geometry_types = shapely.get_type_id(series.array._data)
# ensure to include "... Z" for 3D geometries
has_z = shapely.has_z(series.array._data)
arr_geometry_types[has_z] += 8
geometry_types = Series(arr_geometry_types).unique().tolist()
# drop missing values (shapely.get_type_id returns -1 for those)
if -1 in geometry_types:
geometry_types.remove(-1)
return sorted([_geometry_type_names[idx] for idx in geometry_types])
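# Illustration (editor's addition): _get_geometry_types on a mixed 2D/3D
# GeoSeries; missing geometries (type id -1) are dropped.
def _example_get_geometry_types():
    from shapely.geometry import Point

    s = geopandas.GeoSeries([Point(0, 0), Point(0, 0, 1), None])
    assert _get_geometry_types(s) == ["Point", "Point Z"]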
def _create_metadata(
df, schema_version=None, geometry_encoding=None, write_covering_bbox=False
):
"""Create and encode geo metadata dict.
Parameters
----------
df : GeoDataFrame
schema_version : {'0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', None}
GeoParquet specification version; if not provided will default to
latest supported version.
write_covering_bbox : bool, default False
Writes the bounding box column for each row entry with column
name 'bbox'. Writing a bbox column can be computationally
expensive, hence its default setting is False.
Returns
-------
dict
"""
if schema_version is None:
if geometry_encoding and any(
encoding != "WKB" for encoding in geometry_encoding.values()
):
schema_version = "1.1.0"
else:
schema_version = METADATA_VERSION
if schema_version not in SUPPORTED_VERSIONS:
raise ValueError(
f"schema_version must be one of: {', '.join(SUPPORTED_VERSIONS)}"
)
# Construct metadata for each geometry
column_metadata = {}
for col in df.columns[df.dtypes == "geometry"]:
series = df[col]
geometry_types = _get_geometry_types(series)
if schema_version[0] == "0":
geometry_types_name = "geometry_type"
if len(geometry_types) == 1:
geometry_types = geometry_types[0]
else:
geometry_types_name = "geometry_types"
crs = None
if series.crs:
if schema_version == "0.1.0":
crs = series.crs.to_wkt()
else: # version >= 0.4.0
crs = series.crs.to_json_dict()
_remove_id_from_member_of_ensembles(crs)
column_metadata[col] = {
"encoding": geometry_encoding[col],
"crs": crs,
geometry_types_name: geometry_types,
}
bbox = series.total_bounds.tolist()
if np.isfinite(bbox).all():
# don't add bbox with NaNs for empty / all-NA geometry column
column_metadata[col]["bbox"] = bbox
if write_covering_bbox:
column_metadata[col]["covering"] = {
"bbox": {
"xmin": ["bbox", "xmin"],
"ymin": ["bbox", "ymin"],
"xmax": ["bbox", "xmax"],
"ymax": ["bbox", "ymax"],
},
}
return {
"primary_column": df._geometry_column_name,
"columns": column_metadata,
"version": schema_version,
"creator": {"library": "geopandas", "version": geopandas.__version__},
}
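# Illustration (editor's addition): the metadata produced for a single
# WKB-encoded geometry column, matching the structure sketched above.
def _example_create_metadata():
    from shapely.geometry import Point

    gdf = GeoDataFrame(geometry=[Point(0, 0)], crs="EPSG:4326")
    meta = _create_metadata(gdf, geometry_encoding={"geometry": "WKB"})
    assert meta["primary_column"] == "geometry"
    assert meta["columns"]["geometry"]["encoding"] == "WKB"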
def _encode_metadata(metadata):
"""Encode metadata dict to UTF-8 JSON string
Parameters
----------
metadata : dict
Returns
-------
UTF-8 encoded JSON string
"""
return json.dumps(metadata).encode("utf-8")
def _decode_metadata(metadata_str):
"""Decode a UTF-8 encoded JSON string to dict
Parameters
----------
metadata_str : string (UTF-8 encoded)
Returns
-------
dict
"""
if metadata_str is None:
return None
return json.loads(metadata_str.decode("utf-8"))
def _validate_dataframe(df):
"""Validate that the GeoDataFrame conforms to requirements for writing
to Parquet format.
Raises `ValueError` if the GeoDataFrame is not valid.
copied from `pandas.io.parquet`
Parameters
----------
df : GeoDataFrame
"""
if not isinstance(df, DataFrame):
raise ValueError("Writing to Parquet/Feather only supports IO with DataFrames")
# must have value column names (strings only)
if df.columns.inferred_type not in {"string", "unicode", "empty"}:
raise ValueError("Writing to Parquet/Feather requires string column names")
# index level names must be strings
valid_names = all(
isinstance(name, str) for name in df.index.names if name is not None
)
if not valid_names:
raise ValueError("Index level names must be strings")
def _validate_geo_metadata(metadata):
"""Validate geo metadata.
Must not be empty, and must contain the structure specified above.
Raises ValueError if metadata is not valid.
Parameters
----------
metadata : dict
"""
if not metadata:
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
# version was schema_version in 0.1.0
version = metadata.get("version", metadata.get("schema_version"))
if not version:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key: "
"'version'"
)
required_keys = ("primary_column", "columns")
for key in required_keys:
if metadata.get(key, None) is None:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key: "
"'{key}'".format(key=key)
)
if not isinstance(metadata["columns"], dict):
raise ValueError("'columns' in 'geo' metadata must be a dict")
# Validate that geometry columns have required metadata and values
# leaving out "geometry_type" for compatibility with 0.1
required_col_keys = ("encoding",)
for col, column_metadata in metadata["columns"].items():
for key in required_col_keys:
if key not in column_metadata:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key "
"'{key}' for column '{col}'".format(key=key, col=col)
)
if column_metadata["encoding"] not in SUPPORTED_ENCODINGS:
raise ValueError(
"Only WKB geometry encoding or one of the native encodings "
f"({GEOARROW_ENCODINGS!r}) are supported, "
f"got: {column_metadata['encoding']}"
)
if column_metadata.get("edges", "planar") == "spherical":
warnings.warn(
f"The geo metadata indicate that column '{col}' has spherical edges, "
"but because GeoPandas currently does not support spherical "
"geometry, it ignores this metadata and will interpret the edges of "
"the geometries as planar.",
UserWarning,
stacklevel=4,
)
if "covering" in column_metadata:
covering = column_metadata["covering"]
if "bbox" in covering:
bbox = covering["bbox"]
for var in ["xmin", "ymin", "xmax", "ymax"]:
if var not in bbox.keys():
raise ValueError("Metadata for bbox column is malformed.")
def _geopandas_to_arrow(
df,
index=None,
geometry_encoding="WKB",
schema_version=None,
write_covering_bbox=None,
):
"""
Helper function with main, shared logic for to_parquet/to_feather.
"""
from pyarrow import StructArray
from geopandas.io._geoarrow import geopandas_to_arrow
_validate_dataframe(df)
if schema_version is not None:
if geometry_encoding != "WKB" and schema_version != "1.1.0":
raise ValueError(
"'geoarrow' encoding is only supported with schema version >= 1.1.0"
)
table, geometry_encoding_dict = geopandas_to_arrow(
df, geometry_encoding=geometry_encoding, index=index, interleaved=False
)
geo_metadata = _create_metadata(
df,
schema_version=schema_version,
geometry_encoding=geometry_encoding_dict,
write_covering_bbox=write_covering_bbox,
)
if write_covering_bbox:
if "bbox" in df.columns:
raise ValueError(
"An existing column 'bbox' already exists in the dataframe. "
"Please rename to write covering bbox."
)
bounds = df.bounds
bbox_array = StructArray.from_arrays(
[bounds["minx"], bounds["miny"], bounds["maxx"], bounds["maxy"]],
names=["xmin", "ymin", "xmax", "ymax"],
)
table = table.append_column("bbox", bbox_array)
# Store geopandas specific file-level metadata
# This must be done AFTER creating the table or it is not persisted
metadata = table.schema.metadata
metadata.update({b"geo": _encode_metadata(geo_metadata)})
return table.replace_schema_metadata(metadata)
def _to_parquet(
df,
path,
index=None,
compression="snappy",
geometry_encoding="WKB",
schema_version=None,
write_covering_bbox=False,
**kwargs,
):
"""
Write a GeoDataFrame to the Parquet format.
Any geometry columns present are serialized to WKB format in the file.
Requires 'pyarrow'.
This is tracking version 1.0.0 of the GeoParquet specification at:
https://github.com/opengeospatial/geoparquet. Writing older versions is
supported using the `schema_version` keyword.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output except `RangeIndex` which is stored as metadata only.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
The encoding to use for the geometry columns. Defaults to "WKB"
for maximum interoperability. Specify "geoarrow" to use one of the
native GeoArrow-based single-geometry type encodings.
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
GeoParquet specification version; if not provided will default to
latest supported version.
write_covering_bbox : bool, default False
Writes the bounding box column for each row entry with column
name 'bbox'. Writing a bbox column can be computationally
expensive, hence its default setting is False.
**kwargs
Additional keyword arguments passed to pyarrow.parquet.write_table().
"""
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
path = _expand_user(path)
table = _geopandas_to_arrow(
df,
index=index,
geometry_encoding=geometry_encoding,
schema_version=schema_version,
write_covering_bbox=write_covering_bbox,
)
parquet.write_table(table, path, compression=compression, **kwargs)
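# Usage sketch (editor's addition): this private helper backs the public
# ``GeoDataFrame.to_parquet`` method, which is the intended entry point.
def _example_to_parquet(gdf, path="example.parquet"):
    gdf.to_parquet(path, geometry_encoding="WKB", write_covering_bbox=True)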
def _to_feather(df, path, index=None, compression=None, schema_version=None, **kwargs):
"""
Write a GeoDataFrame to the Feather format.
Any geometry columns present are serialized to WKB format in the file.
Requires 'pyarrow' >= 0.17.
This is tracking version 1.0.0 of the GeoParquet specification for
the metadata at: https://github.com/opengeospatial/geoparquet. Writing
older versions is supported using the `schema_version` keyword.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output except `RangeIndex` which is stored as metadata only.
compression : {'zstd', 'lz4', 'uncompressed'}, optional
Name of the compression to use. Use ``"uncompressed"`` for no
compression. By default uses LZ4 if available, otherwise uncompressed.
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
GeoParquet specification version for the metadata; if not provided
will default to latest supported version.
kwargs
Additional keyword arguments passed to pyarrow.feather.write_feather().
"""
feather = import_optional_dependency(
"pyarrow.feather", extra="pyarrow is required for Feather support."
)
# TODO move this into `import_optional_dependency`
import pyarrow
if Version(pyarrow.__version__) < Version("0.17.0"):
raise ImportError("pyarrow >= 0.17 required for Feather support")
path = _expand_user(path)
table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
feather.write_feather(table, path, compression=compression, **kwargs)
def _arrow_to_geopandas(table, geo_metadata=None):
"""
Helper function with main, shared logic for read_parquet/read_feather.
"""
if geo_metadata is None:
# Note: this path of not passing metadata is also used by dask-geopandas
geo_metadata = _validate_and_decode_metadata(table.schema.metadata)
# Find all geometry columns that were read from the file. May
# be a subset if 'columns' parameter is used.
geometry_columns = [
col for col in geo_metadata["columns"] if col in table.column_names
]
result_column_names = list(table.slice(0, 0).to_pandas().columns)
geometry_columns.sort(key=result_column_names.index)
if not len(geometry_columns):
raise ValueError(
"""No geometry columns are included in the columns read from
the Parquet/Feather file. To read this file without geometry columns,
use pandas.read_parquet/read_feather() instead."""
)
geometry = geo_metadata["primary_column"]
# Missing geometry likely indicates a subset of columns was read;
# promote the first available geometry to the primary geometry.
if len(geometry_columns) and geometry not in geometry_columns:
geometry = geometry_columns[0]
# if there are multiple non-primary geometry columns, raise a warning
if len(geometry_columns) > 1:
warnings.warn(
"Multiple non-primary geometry columns read from Parquet/Feather "
"file. The first column read was promoted to the primary geometry.",
stacklevel=3,
)
table_attr = table.drop(geometry_columns)
df = table_attr.to_pandas()
# Convert the WKB columns that are present back to geometry.
for col in geometry_columns:
col_metadata = geo_metadata["columns"][col]
if "crs" in col_metadata:
crs = col_metadata["crs"]
if isinstance(crs, dict):
_remove_id_from_member_of_ensembles(crs)
else:
# per the GeoParquet spec, missing CRS is to be interpreted as
# OGC:CRS84
crs = "OGC:CRS84"
if col_metadata["encoding"] == "WKB":
geom_arr = from_wkb(np.array(table[col]), crs=crs)
else:
from geopandas.io._geoarrow import construct_shapely_array
geom_arr = from_shapely(
construct_shapely_array(
table[col].combine_chunks(), "geoarrow." + col_metadata["encoding"]
),
crs=crs,
)
df.insert(result_column_names.index(col), col, geom_arr)
return GeoDataFrame(df, geometry=geometry)
def _get_filesystem_path(path, filesystem=None, storage_options=None):
"""
Get the filesystem and path for a given filesystem and path.
If the filesystem is not None then it's just returned as is.
"""
import pyarrow
if (
isinstance(path, str)
and storage_options is None
and filesystem is None
and Version(pyarrow.__version__) >= Version("5.0.0")
):
# Use the native pyarrow filesystem if possible.
try:
from pyarrow.fs import FileSystem
filesystem, path = FileSystem.from_uri(path)
except Exception:
# fallback to use get_handle / fsspec for filesystems
# that pyarrow doesn't support
pass
if _is_fsspec_url(path) and filesystem is None:
fsspec = import_optional_dependency(
"fsspec", extra="fsspec is requred for 'storage_options'."
)
filesystem, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
if filesystem is None and storage_options:
raise ValueError(
"Cannot provide 'storage_options' with non-fsspec path '{}'".format(path)
)
return filesystem, path
def _ensure_arrow_fs(filesystem):
"""
Simplified version of pyarrow.fs._ensure_filesystem. This is only needed
below because `pyarrow.parquet.read_metadata` does not yet accept a
filesystem keyword (https://issues.apache.org/jira/browse/ARROW-16719)
"""
from pyarrow import fs
if isinstance(filesystem, fs.FileSystem):
return filesystem
# handle fsspec-compatible filesystems
try:
import fsspec
except ImportError:
pass
else:
if isinstance(filesystem, fsspec.AbstractFileSystem):
return fs.PyFileSystem(fs.FSSpecHandler(filesystem))
return filesystem
def _validate_and_decode_metadata(metadata):
if metadata is None or b"geo" not in metadata:
raise ValueError(
"""Missing geo metadata in Parquet/Feather file.
Use pandas.read_parquet/read_feather() instead."""
)
# check for malformed metadata
try:
decoded_geo_metadata = _decode_metadata(metadata.get(b"geo", b""))
except (TypeError, json.decoder.JSONDecodeError):
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
_validate_geo_metadata(decoded_geo_metadata)
return decoded_geo_metadata
def _read_parquet_schema_and_metadata(path, filesystem):
"""
Open the Parquet file/dataset a first time to get the schema and metadata.
TODO: we should look into how we can reuse the opened dataset for reading the
actual data, to avoid discovering the dataset twice (the problem right now is
that the ParquetDataset interface doesn't allow passing the filters on read).
"""
import pyarrow
from pyarrow import parquet
kwargs = {}
if Version(pyarrow.__version__) < Version("15.0.0"):
kwargs = dict(use_legacy_dataset=False)
try:
schema = parquet.ParquetDataset(path, filesystem=filesystem, **kwargs).schema
except Exception:
schema = parquet.read_schema(path, filesystem=filesystem)
metadata = schema.metadata
# read metadata separately to get the raw Parquet FileMetaData metadata
# (pyarrow doesn't properly expose those in schema.metadata for files
# created by GDAL - https://issues.apache.org/jira/browse/ARROW-16688)
if metadata is None or b"geo" not in metadata:
try:
metadata = parquet.read_metadata(path, filesystem=filesystem).metadata
except Exception:
pass
return schema, metadata
def _read_parquet(path, columns=None, storage_options=None, bbox=None, **kwargs):
"""
Load a Parquet object from the file path, returning a GeoDataFrame.
You can read a subset of columns in the file using the ``columns`` parameter.
However, the structure of the returned GeoDataFrame will depend on which
columns you read:
* if no geometry columns are read, this will raise a ``ValueError`` - you
should use the pandas `read_parquet` method instead.
* if the primary geometry column saved to this file is not included in
columns, the first available geometry column will be set as the geometry
column of the returned GeoDataFrame.
Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
specification at: https://github.com/opengeospatial/geoparquet
If 'crs' key is not present in the GeoParquet metadata associated with the
Parquet object, it will default to "OGC:CRS84" according to the specification.
Requires 'pyarrow'.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
columns : list-like of strings, default=None
If not None, only these columns will be read from the file. If
the primary geometry column is not included, the first secondary
geometry read from the file will be set as the geometry column
of the returned GeoDataFrame. If no geometry columns are present,
a ``ValueError`` will be raised.
storage_options : dict, optional
Extra options that make sense for a particular storage connection, e.g. host,
port, username, password, etc. For HTTP(S) URLs the key-value pairs are
forwarded to urllib as header options. For other URLs (e.g. starting with
"s3://", and "gcs://") the key-value pairs are forwarded to fsspec. Please
see fsspec and urllib for more details.
When no storage options are provided and a filesystem is implemented by
both ``pyarrow.fs`` and ``fsspec`` (e.g. "s3://") then the ``pyarrow.fs``
filesystem is preferred. Provide the instantiated fsspec filesystem using
the ``filesystem`` keyword if you wish to use its implementation.
bbox : tuple, optional
Bounding box to be used to filter selection from geoparquet data. This
is only usable if the data was saved with the bbox covering metadata.
Input is of the tuple format (xmin, ymin, xmax, ymax).
**kwargs
Any additional kwargs passed to :func:`pyarrow.parquet.read_table`.
Returns
-------
GeoDataFrame
Examples
--------
>>> df = geopandas.read_parquet("data.parquet") # doctest: +SKIP
Specifying columns to read:
>>> df = geopandas.read_parquet(
... "data.parquet",
... columns=["geometry", "pop_est"]
... ) # doctest: +SKIP
"""
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
import geopandas.io._pyarrow_hotfix # noqa: F401
# TODO(https://github.com/pandas-dev/pandas/pull/41194): see if pandas
# adds filesystem as a keyword and match that.
filesystem = kwargs.pop("filesystem", None)
filesystem, path = _get_filesystem_path(
path, filesystem=filesystem, storage_options=storage_options
)
path = _expand_user(path)
schema, metadata = _read_parquet_schema_and_metadata(path, filesystem)
geo_metadata = _validate_and_decode_metadata(metadata)
bbox_filter = (
_get_parquet_bbox_filter(geo_metadata, bbox) if bbox is not None else None
)
if_bbox_column_exists = _check_if_covering_in_geo_metadata(geo_metadata)
# by default, the bbox column is not read in, so we must explicitly list
# the columns to read when it exists.
if not columns and if_bbox_column_exists:
columns = _get_non_bbox_columns(schema, geo_metadata)
# if both bbox and filters kwargs are used, must splice together.
if "filters" in kwargs:
filters_kwarg = kwargs.pop("filters")
filters = _splice_bbox_and_filters(filters_kwarg, bbox_filter)
else:
filters = bbox_filter
kwargs["use_pandas_metadata"] = True
table = parquet.read_table(
path, columns=columns, filesystem=filesystem, filters=filters, **kwargs
)
return _arrow_to_geopandas(table, geo_metadata)
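# Usage sketch (editor's addition): reading back with a bbox filter, which
# requires the file to carry bbox covering metadata (e.g. written with
# ``write_covering_bbox=True``) or to use the 'point' encoding.
def _example_read_parquet_bbox(path="example.parquet"):
    return _read_parquet(path, bbox=(0.0, 0.0, 10.0, 10.0))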
def _read_feather(path, columns=None, **kwargs):
"""
Load a Feather object from the file path, returning a GeoDataFrame.
You can read a subset of columns in the file using the ``columns`` parameter.
However, the structure of the returned GeoDataFrame will depend on which
columns you read:
* if no geometry columns are read, this will raise a ``ValueError`` - you
should use the pandas `read_feather` method instead.
* if the primary geometry column saved to this file is not included in
columns, the first available geometry column will be set as the geometry
column of the returned GeoDataFrame.
Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
specification at: https://github.com/opengeospatial/geoparquet
If 'crs' key is not present in the Feather metadata associated with the
Parquet object, it will default to "OGC:CRS84" according to the specification.
Requires 'pyarrow' >= 0.17.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
columns : list-like of strings, default=None
If not None, only these columns will be read from the file. If
the primary geometry column is not included, the first secondary
geometry read from the file will be set as the geometry column
of the returned GeoDataFrame. If no geometry columns are present,
a ``ValueError`` will be raised.
**kwargs
Any additional kwargs passed to pyarrow.feather.read_table().
Returns
-------
GeoDataFrame
Examples
--------
>>> df = geopandas.read_feather("data.feather") # doctest: +SKIP
Specifying columns to read:
>>> df = geopandas.read_feather(
... "data.feather",
... columns=["geometry", "pop_est"]
... ) # doctest: +SKIP
"""
feather = import_optional_dependency(
"pyarrow.feather", extra="pyarrow is required for Feather support."
)
# TODO move this into `import_optional_dependency`
import pyarrow
import geopandas.io._pyarrow_hotfix # noqa: F401
if Version(pyarrow.__version__) < Version("0.17.0"):
raise ImportError("pyarrow >= 0.17 required for Feather support")
path = _expand_user(path)
table = feather.read_table(path, columns=columns, **kwargs)
return _arrow_to_geopandas(table)
def _get_parquet_bbox_filter(geo_metadata, bbox):
primary_column = geo_metadata["primary_column"]
if _check_if_covering_in_geo_metadata(geo_metadata):
bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
return _convert_bbox_to_parquet_filter(bbox, bbox_column_name)
elif geo_metadata["columns"][primary_column]["encoding"] == "point":
import pyarrow.compute as pc
return (
(pc.field((primary_column, "x")) >= bbox[0])
& (pc.field((primary_column, "x")) <= bbox[2])
& (pc.field((primary_column, "y")) >= bbox[1])
& (pc.field((primary_column, "y")) <= bbox[3])
)
else:
raise ValueError(
"Specifying 'bbox' not supported for this Parquet file (it should either "
"have a bbox covering column or use 'point' encoding)."
)
def _convert_bbox_to_parquet_filter(bbox, bbox_column_name):
import pyarrow.compute as pc
return ~(
(pc.field((bbox_column_name, "xmin")) > bbox[2])
| (pc.field((bbox_column_name, "ymin")) > bbox[3])
| (pc.field((bbox_column_name, "xmax")) < bbox[0])
| (pc.field((bbox_column_name, "ymax")) < bbox[1])
)
def _check_if_covering_in_geo_metadata(geo_metadata):
primary_column = geo_metadata["primary_column"]
return "covering" in geo_metadata["columns"][primary_column].keys()
def _get_bbox_encoding_column_name(geo_metadata):
primary_column = geo_metadata["primary_column"]
return geo_metadata["columns"][primary_column]["covering"]["bbox"]["xmin"][0]
def _get_non_bbox_columns(schema, geo_metadata):
bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
columns = schema.names
if bbox_column_name in columns:
columns.remove(bbox_column_name)
return columns
def _splice_bbox_and_filters(kwarg_filters, bbox_filter):
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
if bbox_filter is None:
return kwarg_filters
if kwarg_filters is None:
# only a bbox filter was provided; nothing to combine it with
return bbox_filter
filters_expression = parquet.filters_to_expression(kwarg_filters)
return bbox_filter & filters_expression
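# Hedged usage sketch (not in the original source); 'pop_est' is a
# hypothetical column name:
#
#   bbox_filter = _convert_bbox_to_parquet_filter((0, 0, 10, 10), "bbox")
#   expr = _splice_bbox_and_filters([("pop_est", ">", 1000)], bbox_filter)
#
# The result is a pyarrow.compute.Expression combining both conditions,
# suitable for passing as the 'filters' argument when reading the dataset.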

View File

@@ -1,851 +0,0 @@
from __future__ import annotations
import os
import urllib.request
import warnings
from io import IOBase
from packaging.version import Version
from pathlib import Path
# Adapted from pandas.io.common
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc, uses_params, uses_relative
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
import shapely
from shapely.geometry import mapping
from shapely.geometry.base import BaseGeometry
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20
from geopandas.io.util import vsi_path
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
# file:// URIs are supported by fiona/pyogrio -> don't already open + read the file here
_VALID_URLS.discard("file")
fiona = None
fiona_env = None
fiona_import_error = None
FIONA_GE_19 = False
def _import_fiona():
global fiona
global fiona_env
global fiona_import_error
global FIONA_GE_19
if fiona is None:
try:
import fiona
# only try to import fiona.Env if the main fiona import succeeded
# (otherwise you can get confusing "AttributeError: module 'fiona'
# has no attribute '_loading'" / partially initialized module errors)
try:
from fiona import Env as fiona_env
except ImportError:
try:
from fiona import drivers as fiona_env
except ImportError:
fiona_env = None
FIONA_GE_19 = Version(Version(fiona.__version__).base_version) >= Version(
"1.9.0"
)
except ImportError as err:
fiona = False
fiona_import_error = str(err)
pyogrio = None
pyogrio_import_error = None
def _import_pyogrio():
global pyogrio
global pyogrio_import_error
if pyogrio is None:
try:
import pyogrio
except ImportError as err:
pyogrio = False
pyogrio_import_error = str(err)
def _check_fiona(func):
if not fiona:
raise ImportError(
f"the {func} requires the 'fiona' package, but it is not installed or does "
f"not import correctly.\nImporting fiona resulted in: {fiona_import_error}"
)
def _check_pyogrio(func):
if not pyogrio:
raise ImportError(
f"the {func} requires the 'pyogrio' package, but it is not installed "
"or does not import correctly."
"\nImporting pyogrio resulted in: {pyogrio_import_error}"
)
def _check_metadata_supported(metadata: str | None, engine: str, driver: str) -> None:
if metadata is None:
return
if driver != "GPKG":
raise NotImplementedError(
"The 'metadata' keyword is only supported for the GPKG driver."
)
if engine == "fiona" and not FIONA_GE_19:
raise NotImplementedError(
"The 'metadata' keyword is only supported for Fiona >= 1.9."
)
def _check_engine(engine, func):
# if not specified through keyword or option, then default to "pyogrio" if
# installed, otherwise try fiona
if engine is None:
import geopandas
engine = geopandas.options.io_engine
if engine is None:
_import_pyogrio()
if pyogrio:
engine = "pyogrio"
else:
_import_fiona()
if fiona:
engine = "fiona"
if engine == "pyogrio":
_import_pyogrio()
_check_pyogrio(func)
elif engine == "fiona":
_import_fiona()
_check_fiona(func)
elif engine is None:
raise ImportError(
f"The {func} requires the 'pyogrio' or 'fiona' package, "
"but neither is installed or imports correctly."
f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
f"\nImporting fiona resulted in: {fiona_import_error}"
)
return engine
_EXTENSION_TO_DRIVER = {
".bna": "BNA",
".dxf": "DXF",
".csv": "CSV",
".shp": "ESRI Shapefile",
".dbf": "ESRI Shapefile",
".json": "GeoJSON",
".geojson": "GeoJSON",
".geojsonl": "GeoJSONSeq",
".geojsons": "GeoJSONSeq",
".gpkg": "GPKG",
".gml": "GML",
".xml": "GML",
".gpx": "GPX",
".gtm": "GPSTrackMaker",
".gtz": "GPSTrackMaker",
".tab": "MapInfo File",
".mif": "MapInfo File",
".mid": "MapInfo File",
".dgn": "DGN",
".fgb": "FlatGeobuf",
}
def _expand_user(path):
"""Expand paths that use ~."""
if isinstance(path, str):
path = os.path.expanduser(path)
elif isinstance(path, Path):
path = path.expanduser()
return path
def _is_url(url):
"""Check to see if *url* has a valid protocol."""
try:
return parse_url(url).scheme in _VALID_URLS
except Exception:
return False
def _read_file(
filename, bbox=None, mask=None, columns=None, rows=None, engine=None, **kwargs
):
"""
Returns a GeoDataFrame from a file or URL.
Parameters
----------
filename : str, path object or file-like object
Either the absolute or relative path to the file or URL to
be opened, or any object with a read() method (such as an open file
or StringIO)
bbox : tuple | GeoDataFrame or GeoSeries | shapely Geometry, default None
Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely
geometry. With engine="fiona", CRS mis-matches are resolved if given a GeoSeries
or GeoDataFrame. With engine="pyogrio", bbox must be in the same CRS as the
dataset. Tuple is (minx, miny, maxx, maxy) to match the bounds property of
shapely geometry objects. Cannot be used with mask.
mask : dict | GeoDataFrame or GeoSeries | shapely Geometry, default None
Filter for features that intersect with the given dict-like geojson
geometry, GeoSeries, GeoDataFrame or shapely geometry.
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
Cannot be used with bbox. If multiple geometries are passed, this will
first union all geometries, which may be computationally expensive.
columns : list, optional
List of column names to import from the data source. Column names
must exactly match the names in the data source. To avoid reading
any columns (besides the geometry column), pass an empty list-like.
By default reads all columns.
rows : int or slice, default None
Load in specific rows by passing an integer (first `n` rows) or a
slice() object.
engine : str, "pyogrio" or "fiona"
The underlying library that is used to read the file. Currently, the
supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
installed, otherwise tries "fiona". Engine can also be set globally
with the ``geopandas.options.io_engine`` option.
**kwargs :
Keyword args to be passed to the engine, and can be used to read
multi-layer data, data stored within archives (zip files), etc.
In case of the "pyogrio" engine, the keyword arguments are passed to
`pyogrio.read_dataframe`. In case of the "fiona" engine, the keyword
arguments are passed to `fiona.open`. For more information on possible
keywords, type: ``import pyogrio; help(pyogrio.read_dataframe)``.
Examples
--------
>>> df = geopandas.read_file("nybb.shp") # doctest: +SKIP
Specifying layer of GPKG:
>>> df = geopandas.read_file("file.gpkg", layer='cities') # doctest: +SKIP
Reading only first 10 rows:
>>> df = geopandas.read_file("nybb.shp", rows=10) # doctest: +SKIP
Reading only geometries intersecting ``mask``:
>>> df = geopandas.read_file("nybb.shp", mask=polygon) # doctest: +SKIP
Reading only geometries intersecting ``bbox``:
>>> df = geopandas.read_file("nybb.shp", bbox=(0, 0, 10, 20)) # doctest: +SKIP
Returns
-------
:obj:`geopandas.GeoDataFrame` or :obj:`pandas.DataFrame` :
If `ignore_geometry=True` a :obj:`pandas.DataFrame` will be returned.
Notes
-----
The format drivers will attempt to detect the encoding of your data, but
may fail. In this case, the proper encoding can be specified explicitly
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
When specifying a URL, geopandas will check if the server supports reading
partial data and in that case pass the URL as is to the underlying engine,
which will then use the network file system handler of GDAL to read from
the URL. Otherwise geopandas will download the data from the URL and pass
all data in-memory to the underlying engine.
If you need more control over how the URL is read, you can specify the
GDAL virtual filesystem manually (e.g. ``/vsicurl/https://...``). See the
GDAL documentation on filesystems for more details
(https://gdal.org/user/virtual_file_systems.html#vsicurl-http-https-ftp-files-random-access).
"""
engine = _check_engine(engine, "'read_file' function")
filename = _expand_user(filename)
from_bytes = False
if _is_url(filename):
# if it is a url that supports random access -> pass through to
# pyogrio/fiona as is (to support downloading only part of the file)
# otherwise still download manually because pyogrio/fiona don't support
# all types of urls (https://github.com/geopandas/geopandas/issues/2908)
with urllib.request.urlopen(filename) as response:
if not response.headers.get("Accept-Ranges") == "bytes":
filename = response.read()
from_bytes = True
if engine == "pyogrio":
return _read_file_pyogrio(
filename, bbox=bbox, mask=mask, columns=columns, rows=rows, **kwargs
)
elif engine == "fiona":
if pd.api.types.is_file_like(filename):
data = filename.read()
path_or_bytes = data.encode("utf-8") if isinstance(data, str) else data
from_bytes = True
else:
path_or_bytes = filename
return _read_file_fiona(
path_or_bytes,
from_bytes,
bbox=bbox,
mask=mask,
columns=columns,
rows=rows,
**kwargs,
)
else:
raise ValueError(f"unknown engine '{engine}'")
def _read_file_fiona(
path_or_bytes,
from_bytes,
bbox=None,
mask=None,
columns=None,
rows=None,
where=None,
**kwargs,
):
if where is not None and not FIONA_GE_19:
raise NotImplementedError("where requires fiona 1.9+")
if columns is not None:
if "include_fields" in kwargs:
raise ValueError(
"Cannot specify both 'include_fields' and 'columns' keywords"
)
if not FIONA_GE_19:
raise NotImplementedError("'columns' keyword requires fiona 1.9+")
kwargs["include_fields"] = columns
elif "include_fields" in kwargs:
# alias to columns, as this variable is used below to specify column order
# in the dataframe creation
columns = kwargs["include_fields"]
if not from_bytes:
# Opening a file via URL or file-like-object above automatically detects a
# zipped file. In order to match that behavior, attempt to add a zip scheme
# if missing.
path_or_bytes = vsi_path(str(path_or_bytes))
if from_bytes:
reader = fiona.BytesCollection
else:
reader = fiona.open
with fiona_env():
with reader(path_or_bytes, **kwargs) as features:
crs = features.crs_wkt
# attempt to get EPSG code
try:
# fiona 1.9+
epsg = features.crs.to_epsg(confidence_threshold=100)
if epsg is not None:
crs = epsg
except AttributeError:
# fiona <= 1.8
try:
crs = features.crs["init"]
except (TypeError, KeyError):
pass
# handle loading the bounding box
if bbox is not None:
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
bbox = tuple(bbox.to_crs(crs).total_bounds)
elif isinstance(bbox, BaseGeometry):
bbox = bbox.bounds
assert len(bbox) == 4
# handle loading the mask
elif isinstance(mask, (GeoDataFrame, GeoSeries)):
mask = mapping(mask.to_crs(crs).union_all())
elif isinstance(mask, BaseGeometry):
mask = mapping(mask)
filters = {}
if bbox is not None:
filters["bbox"] = bbox
if mask is not None:
filters["mask"] = mask
if where is not None:
filters["where"] = where
# setup the data loading filter
if rows is not None:
if isinstance(rows, int):
rows = slice(rows)
elif not isinstance(rows, slice):
raise TypeError("'rows' must be an integer or a slice.")
f_filt = features.filter(rows.start, rows.stop, rows.step, **filters)
elif filters:
f_filt = features.filter(**filters)
else:
f_filt = features
# get list of columns
columns = columns or list(features.schema["properties"])
datetime_fields = [
k for (k, v) in features.schema["properties"].items() if v == "datetime"
]
if (
kwargs.get("ignore_geometry", False)
or features.schema["geometry"] == "None"
):
df = pd.DataFrame(
[record["properties"] for record in f_filt], columns=columns
)
else:
df = GeoDataFrame.from_features(
f_filt, crs=crs, columns=columns + ["geometry"]
)
for k in datetime_fields:
as_dt = None
# plain try catch for when pandas will raise in the future
# TODO we can tighten the exception type in future when it does
try:
with warnings.catch_warnings():
# pandas 2.x does not yet enforce this behaviour but raises a
# warning -> we want to suppress this warning for our users,
# and do this by turning it into an error so we take the
# `except` code path to try again with utc=True
warnings.filterwarnings(
"error",
"In a future version of pandas, parsing datetimes with "
"mixed time zones will raise an error",
FutureWarning,
)
as_dt = pd.to_datetime(df[k])
except Exception:
pass
if as_dt is None or as_dt.dtype == "object":
# if to_datetime failed, try again for mixed timezone offsets
# This can still fail if there are invalid datetimes
try:
as_dt = pd.to_datetime(df[k], utc=True)
except Exception:
pass
# if to_datetime succeeded, round datetimes as
# fiona only supports up to ms precision (any microseconds are
# floating point rounding error)
if as_dt is not None and not (as_dt.dtype == "object"):
if PANDAS_GE_20:
df[k] = as_dt.dt.as_unit("ms")
else:
df[k] = as_dt.dt.round(freq="ms")
return df
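# Illustrative note (not part of the original code): the utc=True retry above
# exists because pandas only parses mixed-UTC-offset datetimes into a single
# tz-aware dtype when asked to normalize them, e.g. (behaviour assumed for
# pandas 2.x):
#   pd.to_datetime(["2020-01-01 00:00+01:00", "2020-01-01 00:00+05:00"])
#       -> warns (and in the future raises), falling back to object dtype
#   pd.to_datetime([...], utc=True)
#       -> datetime64[ns, UTC], with all offsets converted to UTC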
def _read_file_pyogrio(path_or_bytes, bbox=None, mask=None, rows=None, **kwargs):
import pyogrio
if rows is not None:
if isinstance(rows, int):
kwargs["max_features"] = rows
elif isinstance(rows, slice):
if rows.start is not None:
if rows.start < 0:
raise ValueError(
"Negative slice start not supported with the 'pyogrio' engine."
)
kwargs["skip_features"] = rows.start
if rows.stop is not None:
kwargs["max_features"] = rows.stop - (rows.start or 0)
if rows.step is not None:
raise ValueError("slice with step is not supported")
else:
raise TypeError("'rows' must be an integer or a slice.")
if bbox is not None and mask is not None:
# match error message from Fiona
raise ValueError("mask and bbox can not be set together")
if bbox is not None:
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
crs = pyogrio.read_info(path_or_bytes).get("crs")
if isinstance(path_or_bytes, IOBase):
path_or_bytes.seek(0)
bbox = tuple(bbox.to_crs(crs).total_bounds)
elif isinstance(bbox, BaseGeometry):
bbox = bbox.bounds
if len(bbox) != 4:
raise ValueError("'bbox' should be a length-4 tuple.")
if mask is not None:
# NOTE: mask cannot be used at same time as bbox keyword
if isinstance(mask, (GeoDataFrame, GeoSeries)):
crs = pyogrio.read_info(path_or_bytes).get("crs")
if isinstance(path_or_bytes, IOBase):
path_or_bytes.seek(0)
mask = shapely.unary_union(mask.to_crs(crs).geometry.values)
elif isinstance(mask, BaseGeometry):
mask = shapely.unary_union(mask)
elif isinstance(mask, dict) or hasattr(mask, "__geo_interface__"):
# convert GeoJSON to shapely geometry
mask = shapely.geometry.shape(mask)
kwargs["mask"] = mask
if kwargs.pop("ignore_geometry", False):
kwargs["read_geometry"] = False
# translate `ignore_fields`/`include_fields` keyword for back compat with fiona
if "ignore_fields" in kwargs and "include_fields" in kwargs:
raise ValueError("Cannot specify both 'ignore_fields' and 'include_fields'")
elif "ignore_fields" in kwargs:
if kwargs.get("columns", None) is not None:
raise ValueError(
"Cannot specify both 'columns' and 'ignore_fields' keywords"
)
warnings.warn(
"The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
"will be removed in a future release. You can use the 'columns' keyword "
"instead to select which columns to read.",
DeprecationWarning,
stacklevel=3,
)
ignore_fields = kwargs.pop("ignore_fields")
fields = pyogrio.read_info(path_or_bytes)["fields"]
include_fields = [col for col in fields if col not in ignore_fields]
kwargs["columns"] = include_fields
elif "include_fields" in kwargs:
# translate `include_fields` keyword for back compat with fiona engine
if kwargs.get("columns", None) is not None:
raise ValueError(
"Cannot specify both 'columns' and 'include_fields' keywords"
)
warnings.warn(
"The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
"will be removed in a future release. You can use the 'columns' keyword "
"instead to select which columns to read.",
DeprecationWarning,
stacklevel=3,
)
kwargs["columns"] = kwargs.pop("include_fields")
return pyogrio.read_dataframe(path_or_bytes, bbox=bbox, **kwargs)
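# Hedged sketch (not in the original source) of how the 'rows' keyword is
# translated into pyogrio reader arguments above:
#   rows=10             -> max_features=10
#   rows=slice(10, 30)  -> skip_features=10, max_features=20
#   rows=slice(None, 5) -> max_features=5
# Negative starts and stepped slices are rejected because pyogrio reads a
# single contiguous window of features.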
def _detect_driver(path):
"""
Attempt to auto-detect driver based on the extension
"""
try:
# in case the path is a file handle
path = path.name
except AttributeError:
pass
try:
return _EXTENSION_TO_DRIVER[Path(path).suffix.lower()]
except KeyError:
# Assume it is a shapefile folder for now. In the future,
# will likely raise an exception when the expected
# folder writing behavior is more clearly defined.
return "ESRI Shapefile"
def _to_file(
df,
filename,
driver=None,
schema=None,
index=None,
mode="w",
crs=None,
engine=None,
metadata=None,
**kwargs,
):
"""
Write this GeoDataFrame to an OGR data source
A dictionary of supported OGR providers is available via:
>>> import pyogrio
>>> pyogrio.list_drivers() # doctest: +SKIP
Parameters
----------
df : GeoDataFrame to be written
filename : string
File path or file handle to write to. The path may specify a
GDAL VSI scheme.
driver : string, default None
The OGR format driver used to write the vector file.
If not specified, it attempts to infer it from the file extension.
If no extension is specified, it saves ESRI Shapefile to a folder.
schema : dict, default None
If specified, the schema dictionary is passed to Fiona to
better control how the file is written. If None, GeoPandas
will determine the schema based on each column's dtype.
Not supported for the "pyogrio" engine.
index : bool, default None
If True, write index into one or more columns (for MultiIndex).
Default None writes the index into one or more columns only if
the index is named, is a MultiIndex, or has a non-integer data
type. If False, no index is written.
.. versionadded:: 0.7
Previously the index was not written.
mode : string, default 'w'
The write mode, 'w' to overwrite the existing file and 'a' to append;
when using the pyogrio engine, you can also pass ``append=True``.
Not all drivers support appending. For the fiona engine, the drivers
that support appending are listed in fiona.supported_drivers or
https://github.com/Toblerity/Fiona/blob/master/fiona/drvsupport.py.
For the pyogrio engine, you should be able to use any driver that
is available in your installation of GDAL that supports append
capability; see the specific driver entry at
https://gdal.org/drivers/vector/index.html for more information.
crs : pyproj.CRS, default None
If specified, the CRS is passed to Fiona to
better control how the file is written. If None, GeoPandas
will determine the CRS based on the ``crs`` attribute of the GeoDataFrame.
The value can be anything accepted
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
engine : str, "pyogrio" or "fiona"
The underlying library that is used to write the file. Currently, the
supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
installed, otherwise tries "fiona". Engine can also be set globally
with the ``geopandas.options.io_engine`` option.
metadata : dict[str, str], default None
Optional metadata to be stored in the file. Keys and values must be
strings. Only supported for the "GPKG" driver
(requires Fiona >= 1.9 or pyogrio >= 0.6).
**kwargs :
Keyword args to be passed to the engine, and can be used to write
to multi-layer data, store data within archives (zip files), etc.
In case of the "fiona" engine, the keyword arguments are passed to
`fiona.open`. For more information on possible keywords, type:
``import fiona; help(fiona.open)``. In case of the "pyogrio" engine,
the keyword arguments are passed to `pyogrio.write_dataframe`.
Notes
-----
The format drivers will attempt to detect the encoding of your data, but
may fail. In this case, the proper encoding can be specified explicitly
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
"""
engine = _check_engine(engine, "'to_file' method")
filename = _expand_user(filename)
if index is None:
# Determine if index attribute(s) should be saved to file
# (only if they are named or are non-integer)
index = list(df.index.names) != [None] or not is_integer_dtype(df.index.dtype)
if index:
df = df.reset_index(drop=False)
if driver is None:
driver = _detect_driver(filename)
if driver == "ESRI Shapefile" and any(len(c) > 10 for c in df.columns.tolist()):
warnings.warn(
"Column names longer than 10 characters will be truncated when saved to "
"ESRI Shapefile.",
stacklevel=3,
)
if (df.dtypes == "geometry").sum() > 1:
raise ValueError(
"GeoDataFrame contains multiple geometry columns but GeoDataFrame.to_file "
"supports only a single geometry column. Use a GeoDataFrame.to_parquet or "
"GeoDataFrame.to_feather, drop additional geometry columns or convert them "
"to a supported format like a well-known text (WKT) using "
"`GeoSeries.to_wkt()`.",
)
_check_metadata_supported(metadata, engine, driver)
if mode not in ("w", "a"):
raise ValueError(f"'mode' should be one of 'w' or 'a', got '{mode}' instead")
if engine == "pyogrio":
_to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs)
elif engine == "fiona":
_to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs)
else:
raise ValueError(f"unknown engine '{engine}'")
def _to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs):
if not HAS_PYPROJ and crs:
raise ImportError(
"The 'pyproj' package is required to write a file with a CRS, but it is not"
" installed or does not import correctly."
)
if schema is None:
schema = infer_schema(df)
if crs:
from pyproj import CRS
crs = CRS.from_user_input(crs)
else:
crs = df.crs
with fiona_env():
crs_wkt = None
try:
gdal_version = Version(
fiona.env.get_gdal_release_name().strip("e")
) # GH3147
except (AttributeError, ValueError):
gdal_version = Version("2.0.0") # just assume it is not the latest
if gdal_version >= Version("3.0.0") and crs:
crs_wkt = crs.to_wkt()
elif crs:
crs_wkt = crs.to_wkt("WKT1_GDAL")
with fiona.open(
filename, mode=mode, driver=driver, crs_wkt=crs_wkt, schema=schema, **kwargs
) as colxn:
if metadata is not None:
colxn.update_tags(metadata)
colxn.writerecords(df.iterfeatures())
def _to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs):
import pyogrio
if schema is not None:
raise ValueError(
"The 'schema' argument is not supported with the 'pyogrio' engine."
)
if mode == "a":
kwargs["append"] = True
if crs is not None:
raise ValueError("Passing 'crs' is not supported with the 'pyogrio' engine.")
# for the fiona engine, this check is done in gdf.iterfeatures()
if not df.columns.is_unique:
raise ValueError("GeoDataFrame cannot contain duplicated column names.")
pyogrio.write_dataframe(df, filename, driver=driver, metadata=metadata, **kwargs)
def infer_schema(df):
from collections import OrderedDict
# TODO: test pandas string type and boolean type once released
types = {
"Int32": "int32",
"int32": "int32",
"Int64": "int",
"string": "str",
"boolean": "bool",
}
def convert_type(column, in_type):
if in_type == object:
return "str"
if in_type.name.startswith("datetime64"):
# numpy datetime type regardless of frequency
return "datetime"
if str(in_type) in types:
out_type = types[str(in_type)]
else:
out_type = type(np.zeros(1, in_type).item()).__name__
if out_type == "long":
out_type = "int"
return out_type
properties = OrderedDict(
[
(col, convert_type(col, _type))
for col, _type in zip(df.columns, df.dtypes)
if col != df._geometry_column_name
]
)
if df.empty:
warnings.warn(
"You are attempting to write an empty DataFrame to file. "
"For some drivers, this operation may fail.",
UserWarning,
stacklevel=3,
)
# Since https://github.com/Toblerity/Fiona/issues/446 resolution,
# Fiona allows a list of geometry types
geom_types = _geometry_types(df)
schema = {"geometry": geom_types, "properties": properties}
return schema
def _geometry_types(df):
"""
Determine the geometry types in the GeoDataFrame for the schema.
"""
geom_types_2D = df[~df.geometry.has_z].geometry.geom_type.unique()
geom_types_2D = [gtype for gtype in geom_types_2D if gtype is not None]
geom_types_3D = df[df.geometry.has_z].geometry.geom_type.unique()
geom_types_3D = ["3D " + gtype for gtype in geom_types_3D if gtype is not None]
geom_types = geom_types_3D + geom_types_2D
if len(geom_types) == 0:
# Default geometry type supported by Fiona
# (Since https://github.com/Toblerity/Fiona/issues/446 resolution)
return "Unknown"
if len(geom_types) == 1:
geom_types = geom_types[0]
return geom_types
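# Hedged sketch (not part of the original module) of what these two helpers
# produce together; the column names are hypothetical:
#   gdf = GeoDataFrame({"a": [1], "geometry": [Point(0, 0)]})
#   infer_schema(gdf)
#   -> {"geometry": "Point", "properties": OrderedDict([("a", "int")])}
# When both 2D and 3D geometries are present, the "geometry" entry becomes a
# list, e.g. ["3D Point", "Point"].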
def _list_layers(filename) -> pd.DataFrame:
"""List layers available in a file.
Provides an overview of layers available in a file or URL together with their
geometry types. When supported by the data source, this includes both spatial and
non-spatial layers. Non-spatial layers are indicated by the ``"geometry_type"``
column being ``None``. GeoPandas will not read such layers but they can be read into
a pd.DataFrame using :func:`pyogrio.read_dataframe`.
Parameters
----------
filename : str, path object or file-like object
Either the absolute or relative path to the file or URL to
be opened, or any object with a read() method (such as an open file
or StringIO)
Returns
-------
pandas.DataFrame
A DataFrame with columns "name" and "geometry_type" and one row per layer.
"""
_import_pyogrio()
_check_pyogrio("list_layers")
import pyogrio
return pd.DataFrame(
pyogrio.list_layers(filename), columns=["name", "geometry_type"]
)

View File

@@ -1,473 +0,0 @@
import warnings
from contextlib import contextmanager
from functools import lru_cache
import pandas as pd
import shapely
import shapely.wkb
from geopandas import GeoDataFrame
@contextmanager
def _get_conn(conn_or_engine):
"""
Yield a connection within a transaction context.
Engine.begin() returns a Connection with an implicit Transaction while
Connection.begin() returns the Transaction. This helper will always return a
Connection with an implicit (possibly nested) Transaction.
Parameters
----------
conn_or_engine : Connection or Engine
A sqlalchemy Connection or Engine instance
Returns
-------
Connection
"""
from sqlalchemy.engine.base import Connection, Engine
if isinstance(conn_or_engine, Connection):
if not conn_or_engine.in_transaction():
with conn_or_engine.begin():
yield conn_or_engine
else:
yield conn_or_engine
elif isinstance(conn_or_engine, Engine):
with conn_or_engine.begin() as conn:
yield conn
else:
raise ValueError(f"Unknown Connectable: {conn_or_engine}")
def _df_to_geodf(df, geom_col="geom", crs=None, con=None):
"""
Transforms a pandas DataFrame into a GeoDataFrame.
The column 'geom_col' must be a geometry column in WKB representation.
To be used to convert df based on pd.read_sql to gdf.
Parameters
----------
df : DataFrame
pandas DataFrame with geometry column in WKB representation.
geom_col : string, default 'geom'
column name to convert to shapely geometries
crs : pyproj.CRS, optional
CRS to use for the returned GeoDataFrame. The value can be anything accepted
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
If not set, tries to determine CRS from the SRID associated with the
first geometry in the database, and assigns that to all geometries.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the database to query.
Returns
-------
GeoDataFrame
"""
if geom_col not in df:
raise ValueError("Query missing geometry column '{}'".format(geom_col))
if df.columns.to_list().count(geom_col) > 1:
raise ValueError(
f"Duplicate geometry column '{geom_col}' detected in SQL query output. Only"
"one geometry column is allowed."
)
geoms = df[geom_col].dropna()
if not geoms.empty:
# load geometries from raw WKB bytes
load_geom_bytes = shapely.wkb.loads
def load_geom_text(x):
"""Load from binary encoded as text."""
return shapely.wkb.loads(str(x), hex=True)
if isinstance(geoms.iat[0], bytes):
load_geom = load_geom_bytes
else:
load_geom = load_geom_text
df[geom_col] = geoms = geoms.apply(load_geom)
if crs is None:
srid = shapely.get_srid(geoms.iat[0])
# if no defined SRID in geodatabase, returns SRID of 0
if srid != 0:
try:
spatial_ref_sys_df = _get_spatial_ref_sys_df(con, srid)
except pd.errors.DatabaseError:
warning_msg = (
f"Could not find the spatial reference system table "
f"(spatial_ref_sys) in PostGIS."
f"Trying epsg:{srid} as a fallback."
)
warnings.warn(warning_msg, UserWarning, stacklevel=3)
crs = "epsg:{}".format(srid)
else:
if not spatial_ref_sys_df.empty:
auth_name = spatial_ref_sys_df["auth_name"].item()
crs = f"{auth_name}:{srid}"
else:
warning_msg = (
f"Could not find srid {srid} in the "
f"spatial_ref_sys table. "
f"Trying epsg:{srid} as a fallback."
)
warnings.warn(warning_msg, UserWarning, stacklevel=3)
crs = "epsg:{}".format(srid)
return GeoDataFrame(df, crs=crs, geometry=geom_col)
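# Hedged usage sketch (not in the original source); the table and column
# names are hypothetical:
#   df = pd.read_sql("SELECT id, geom FROM roads", con)
#   gdf = _df_to_geodf(df, geom_col="geom", con=con)
# The WKB payload in 'geom' is decoded with shapely, and when 'crs' is not
# given the SRID of the first geometry is looked up in spatial_ref_sys.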
def _read_postgis(
sql,
con,
geom_col="geom",
crs=None,
index_col=None,
coerce_float=True,
parse_dates=None,
params=None,
chunksize=None,
):
"""
Returns a GeoDataFrame corresponding to the result of the query
string, which must contain a geometry column in WKB representation.
It is also possible to use :meth:`~GeoDataFrame.read_file` to read from a database.
Especially for file geodatabases like GeoPackage or SpatiaLite this can be easier.
Parameters
----------
sql : string
SQL query to execute in selecting entries from database, or name
of the table to read from the database.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the database to query.
geom_col : string, default 'geom'
column name to convert to shapely geometries
crs : dict or str, optional
CRS to use for the returned GeoDataFrame; if not set, tries to
determine CRS from the SRID associated with the first geometry in
the database, and assigns that to all geometries.
chunksize : int, default None
If specified, return an iterator where chunksize is the number of rows to
include in each chunk.
See the documentation for pandas.read_sql for further explanation
of the following parameters:
index_col, coerce_float, parse_dates, params, chunksize
Returns
-------
GeoDataFrame
Examples
--------
PostGIS
>>> from sqlalchemy import create_engine # doctest: +SKIP
>>> db_connection_url = "postgresql://myusername:mypassword@myhost:5432/mydatabase"
>>> con = create_engine(db_connection_url) # doctest: +SKIP
>>> sql = "SELECT geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
SpatiaLite
>>> sql = "SELECT ST_AsBinary(geom) AS geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
"""
if chunksize is None:
# read all in one chunk and return a single GeoDataFrame
df = pd.read_sql(
sql,
con,
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
params=params,
chunksize=chunksize,
)
return _df_to_geodf(df, geom_col=geom_col, crs=crs, con=con)
else:
# read data in chunks and return a generator
df_generator = pd.read_sql(
sql,
con,
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
params=params,
chunksize=chunksize,
)
return (
_df_to_geodf(df, geom_col=geom_col, crs=crs, con=con) for df in df_generator
)
def _get_geometry_type(gdf):
"""
Get basic geometry type of a GeoDataFrame. See more info from:
https://geoalchemy-2.readthedocs.io/en/latest/types.html#geoalchemy2.types._GISType
The following rules apply:
- if geometries all share the same geometry-type,
geometries are inserted with the given GeometryType with following types:
- Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon,
GeometryCollection.
- LinearRing geometries will be converted into LineString objects.
- in all other cases, geometries will be inserted with type GEOMETRY:
- a mix of Polygons and MultiPolygons in GeoSeries
- a mix of Points and LineStrings in GeoSeries
- geometry is of type GeometryCollection,
such as GeometryCollection([Point, LineStrings])
- if any of the geometries has Z-coordinate, all records will
be written with 3D.
"""
geom_types = list(gdf.geometry.geom_type.unique())
has_curve = False
for gt in geom_types:
if gt is None:
continue
elif "LinearRing" in gt:
has_curve = True
if len(geom_types) == 1:
if has_curve:
target_geom_type = "LINESTRING"
else:
if geom_types[0] is None:
raise ValueError("No valid geometries in the data.")
else:
target_geom_type = geom_types[0].upper()
else:
target_geom_type = "GEOMETRY"
# Check for 3D-coordinates
if any(gdf.geometry.has_z):
target_geom_type += "Z"
return target_geom_type, has_curve
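# Illustrative outcomes (assumed, not in the original source):
#   all Points                  -> ("POINT", False)
#   Polygons and MultiPolygons  -> ("GEOMETRY", False)
#   all 3D Points               -> ("POINTZ", False)
#   LinearRings only            -> ("LINESTRING", True); converted downstream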
def _get_srid_from_crs(gdf):
"""
Get EPSG code from CRS if available. If not, return 0.
"""
# Use geoalchemy2 default for srid
# Note: undefined srid in PostGIS is 0
srid = None
warning_msg = (
"Could not parse CRS from the GeoDataFrame. "
"Inserting data without defined CRS."
)
if gdf.crs is not None:
try:
for confidence in (100, 70, 25):
srid = gdf.crs.to_epsg(min_confidence=confidence)
if srid is not None:
break
auth_srid = gdf.crs.to_authority(
auth_name="ESRI", min_confidence=confidence
)
if auth_srid is not None:
srid = int(auth_srid[1])
break
except Exception:
warnings.warn(warning_msg, UserWarning, stacklevel=2)
if srid is None:
srid = 0
warnings.warn(warning_msg, UserWarning, stacklevel=2)
return srid
def _convert_linearring_to_linestring(gdf, geom_name):
from shapely.geometry import LineString
# Todo: Use shapely function once it's implemented:
# https://github.com/shapely/shapely/issues/1617
mask = gdf.geom_type == "LinearRing"
gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(
lambda geom: LineString(geom)
)
return gdf
def _convert_to_ewkb(gdf, geom_name, srid):
"""Convert geometries to ewkb."""
geoms = shapely.to_wkb(
shapely.set_srid(gdf[geom_name].values._data, srid=srid),
hex=True,
include_srid=True,
)
# The gdf will warn that the geometry column doesn't hold in-memory geometries
# now that they are EWKB, so convert back to a regular dataframe to avoid warning
# the user that the dtypes are unexpected.
df = pd.DataFrame(gdf, copy=False)
df[geom_name] = geoms
return df
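# Hedged sketch (not part of the original module) of the conversion for a
# single geometry, assuming shapely >= 2.0:
#   geom = shapely.set_srid(shapely.Point(1, 2), 4326)
#   shapely.to_wkb(geom, hex=True, include_srid=True)
#   -> '0101000020E6100000...' (hex WKB with the SRID flag bit set)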
def _psql_insert_copy(tbl, conn, keys, data_iter):
import csv
import io
s_buf = io.StringIO()
writer = csv.writer(s_buf)
writer.writerows(data_iter)
s_buf.seek(0)
columns = ", ".join('"{}"'.format(k) for k in keys)
dbapi_conn = conn.connection
sql = 'COPY "{}"."{}" ({}) FROM STDIN WITH CSV'.format(
tbl.table.schema, tbl.table.name, columns
)
with dbapi_conn.cursor() as cur:
# Use psycopg method if it's available
if hasattr(cur, "copy") and callable(cur.copy):
with cur.copy(sql) as copy:
copy.write(s_buf.read())
else: # otherwise use psycopg2 method
cur.copy_expert(sql, s_buf)
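# Note (assumed, not in the original source): this callable matches the
# signature pandas expects for DataFrame.to_sql(method=...), so inserts go
# through PostgreSQL COPY rather than row-by-row INSERT statements:
#   df.to_sql("my_table", engine, method=_psql_insert_copy)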
def _write_postgis(
gdf,
name,
con,
schema=None,
if_exists="fail",
index=False,
index_label=None,
chunksize=None,
dtype=None,
):
"""
Upload GeoDataFrame into PostGIS database.
This method requires SQLAlchemy and GeoAlchemy2, and a PostgreSQL
Python driver (e.g. psycopg2) to be installed.
Parameters
----------
name : str
Name of the target table.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the PostGIS database.
if_exists : {'fail', 'replace', 'append'}, default 'fail'
How to behave if the table already exists:
- fail: Raise a ValueError.
- replace: Drop the table before inserting new values.
- append: Insert new values to the existing table.
schema : string, optional
Specify the schema. If None, use default schema: 'public'.
index : bool, default False
Write DataFrame index as a column.
Uses *index_label* as the column name in the table.
index_label : string or sequence, default None
Column label for index column(s).
If None is given (default) and index is True,
then the index names are used.
chunksize : int, optional
Rows will be written in batches of this size at a time.
By default, all rows will be written at once.
dtype : dict of column name to SQL type, default None
Specifying the datatype for columns.
The keys should be the column names and the values
should be the SQLAlchemy types.
Examples
--------
>>> from sqlalchemy import create_engine # doctest: +SKIP
>>> engine = create_engine("postgresql://myusername:mypassword@myhost:5432\
/mydatabase")  # doctest: +SKIP
>>> gdf.to_postgis("my_table", engine) # doctest: +SKIP
"""
try:
from geoalchemy2 import Geometry
from sqlalchemy import text
except ImportError:
raise ImportError("'to_postgis()' requires geoalchemy2 package.")
gdf = gdf.copy()
geom_name = gdf.geometry.name
# Get srid
srid = _get_srid_from_crs(gdf)
# Get geometry type and info whether data contains LinearRing.
geometry_type, has_curve = _get_geometry_type(gdf)
# Build dtype with Geometry
if dtype is not None:
dtype[geom_name] = Geometry(geometry_type=geometry_type, srid=srid)
else:
dtype = {geom_name: Geometry(geometry_type=geometry_type, srid=srid)}
# Convert LinearRing geometries to LineString
if has_curve:
gdf = _convert_linearring_to_linestring(gdf, geom_name)
# Convert geometries to EWKB
gdf = _convert_to_ewkb(gdf, geom_name, srid)
if schema is not None:
schema_name = schema
else:
schema_name = "public"
if if_exists == "append":
# Check that the geometry srid matches with the current GeoDataFrame
with _get_conn(con) as connection:
# Only check SRID if table exists
if connection.dialect.has_table(connection, name, schema):
target_srid = connection.execute(
text(
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
schema=schema_name, table=name, geom_col=geom_name
)
)
).fetchone()[0]
if target_srid != srid:
msg = (
"The CRS of the target table (EPSG:{epsg_t}) differs from the "
"CRS of current GeoDataFrame (EPSG:{epsg_src}).".format(
epsg_t=target_srid, epsg_src=srid
)
)
raise ValueError(msg)
with _get_conn(con) as connection:
gdf.to_sql(
name,
connection,
schema=schema_name,
if_exists=if_exists,
index=index,
index_label=index_label,
chunksize=chunksize,
dtype=dtype,
method=_psql_insert_copy,
)
@lru_cache
def _get_spatial_ref_sys_df(con, srid):
spatial_ref_sys_sql = (
f"SELECT srid, auth_name FROM spatial_ref_sys WHERE srid = {srid}"
)
return pd.read_sql(spatial_ref_sys_sql, con)

View File

@@ -1,100 +0,0 @@
"""
Script to create the data and write legacy storage (pickle) files.
Based on pandas' generate_legacy_storage_files.py script.
To use this script, create an environment for which you want to
generate pickles, activate the environment, and run this script as:
$ python geopandas/geopandas/io/tests/generate_legacy_storage_files.py \
geopandas/geopandas/io/tests/data/pickle/ pickle
This script generates a storage file for the current arch, system, and python version.
The idea here is you are using the *current* version of the
generate_legacy_storage_files with an *older* version of geopandas to
generate a pickle file. We will then check this file into a current
branch, and test using test_pickle.py. This will load the *older*
pickles and test versus the current data that is generated
(with master). These are then compared.
"""
import os
import pickle
import platform
import sys
import pandas as pd
from shapely.geometry import Point
import geopandas
def create_pickle_data():
"""create the pickle data"""
# custom geometry column name
gdf_the_geom = geopandas.GeoDataFrame(
{"a": [1, 2, 3], "the_geom": [Point(1, 1), Point(2, 2), Point(3, 3)]},
geometry="the_geom",
)
# with crs
gdf_crs = geopandas.GeoDataFrame(
{"a": [0.1, 0.2, 0.3], "geometry": [Point(1, 1), Point(2, 2), Point(3, 3)]},
crs="EPSG:4326",
)
return {"gdf_the_geom": gdf_the_geom, "gdf_crs": gdf_crs}
def platform_name():
return "_".join(
[
str(geopandas.__version__),
"pd-" + str(pd.__version__),
"py-" + str(platform.python_version()),
str(platform.machine()),
str(platform.system().lower()),
]
)
def write_legacy_pickles(output_dir):
print(
"This script generates a storage file for the current arch, system, "
"and python version"
)
print("geopandas version: {}").format(geopandas.__version__)
print(" output dir : {}".format(output_dir))
print(" storage format: pickle")
pth = "{}.pickle".format(platform_name())
fh = open(os.path.join(output_dir, pth), "wb")
pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)
fh.close()
print("created pickle file: {}".format(pth))
def main():
if len(sys.argv) != 3:
sys.exit(
"Specify output directory and storage type: generate_legacy_"
"storage_files.py <output_dir> <storage_type> "
)
output_dir = str(sys.argv[1])
storage_type = str(sys.argv[2])
if storage_type == "pickle":
write_legacy_pickles(output_dir=output_dir)
else:
sys.exit("storage_type must be one of {'pickle'}")
if __name__ == "__main__":
main()

View File

@@ -1,328 +0,0 @@
import os
from shapely.geometry import (
LineString,
MultiLineString,
MultiPoint,
MultiPolygon,
Point,
Polygon,
)
import geopandas
from geopandas import GeoDataFrame
from .test_file import FIONA_MARK, PYOGRIO_MARK
import pytest
from geopandas.testing import assert_geodataframe_equal
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
city_hall_boundaries = Polygon(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
)
vauquelin_place = Polygon(
(
(-73.5542465586147, 45.5081555487952),
(-73.5540185061397, 45.5084409343852),
(-73.5546126200639, 45.5086813829106),
(-73.5548825850032, 45.5084033554357),
(-73.5542465586147, 45.5081555487952),
)
)
city_hall_walls = [
LineString(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
)
),
LineString(
(
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
),
]
city_hall_entrance = Point(-73.553785, 45.508722)
city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)
point_3D = Point(-73.553785, 45.508722, 300)
# *****************************************
# TEST TOOLING
class _ExpectedError:
def __init__(self, error_type, error_message_match):
self.type = error_type
self.match = error_message_match
class _ExpectedErrorBuilder:
def __init__(self, composite_key):
self.composite_key = composite_key
def to_raise(self, error_type, error_match):
_expected_exceptions[self.composite_key] = _ExpectedError(
error_type, error_match
)
def _expect_writing(gdf, ogr_driver):
return _ExpectedErrorBuilder(_composite_key(gdf, ogr_driver))
def _composite_key(gdf, ogr_driver):
return frozenset([id(gdf), ogr_driver])
def _expected_error_on(gdf, ogr_driver):
composite_key = _composite_key(gdf, ogr_driver)
return _expected_exceptions.get(composite_key, None)
# *****************************************
# TEST CASES
_geodataframes_to_write = []
_expected_exceptions = {}
_CRS = "epsg:4326"
# ------------------
# gdf with Points
gdf = GeoDataFrame(
{"a": [1, 2]}, crs=_CRS, geometry=[city_hall_entrance, city_hall_balcony]
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with MultiPoints
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[
MultiPoint([city_hall_balcony, city_hall_council_chamber]),
MultiPoint([city_hall_entrance, city_hall_balcony, city_hall_council_chamber]),
],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with Points and MultiPoints
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[MultiPoint([city_hall_entrance, city_hall_balcony]), city_hall_balcony],
)
_geodataframes_to_write.append(gdf)
# 'ESRI Shapefile' driver supports writing LineString/MultiLinestring and
# Polygon/MultiPolygon but does not mention Point/MultiPoint
# see https://www.gdal.org/drv_shapefile.html
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
# ------------------
# gdf with LineStrings
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=city_hall_walls)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with MultiLineStrings
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[MultiLineString(city_hall_walls), MultiLineString(city_hall_walls)],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with LineStrings and MultiLineStrings
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with Polygons
gdf = GeoDataFrame(
{"a": [1, 2]}, crs=_CRS, geometry=[city_hall_boundaries, vauquelin_place]
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with MultiPolygon
gdf = GeoDataFrame(
{"a": [1]},
crs=_CRS,
geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with Polygon and MultiPolygon
gdf = GeoDataFrame(
{"a": [1, 2]},
crs=_CRS,
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
],
)
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with null geometry and Point
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, city_hall_entrance])
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with null geometry and 3D Point
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, point_3D])
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with null geometries only
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, None])
_geodataframes_to_write.append(gdf)
# ------------------
# gdf with all shape types mixed together
gdf = GeoDataFrame(
{"a": [1, 2, 3, 4, 5, 6]},
crs=_CRS,
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_entrance,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
],
)
_geodataframes_to_write.append(gdf)
# Not supported by 'ESRI Shapefile' driver
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
# ------------------
# gdf with all 2D shape types and 3D Point mixed together
gdf = GeoDataFrame(
{"a": [1, 2, 3, 4, 5, 6, 7]},
crs=_CRS,
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_entrance,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
point_3D,
],
)
_geodataframes_to_write.append(gdf)
# Not supported by 'ESRI Shapefile' driver
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
@pytest.fixture(params=_geodataframes_to_write)
def geodataframe(request):
return request.param
@pytest.fixture(
params=[
("GeoJSON", ".geojson"),
("ESRI Shapefile", ".shp"),
("GPKG", ".gpkg"),
("SQLite", ".sqlite"),
]
)
def ogr_driver(request):
return request.param
@pytest.fixture(
params=[
pytest.param("fiona", marks=FIONA_MARK),
pytest.param("pyogrio", marks=PYOGRIO_MARK),
]
)
def engine(request):
return request.param
def test_to_file_roundtrip(tmpdir, geodataframe, ogr_driver, engine):
driver, ext = ogr_driver
output_file = os.path.join(str(tmpdir), "output_file" + ext)
write_kwargs = {}
if driver == "SQLite":
write_kwargs["spatialite"] = True
# This if statement can be removed once minimal fiona version >= 1.8.20
if engine == "fiona":
from packaging.version import Version
import fiona
if Version(fiona.__version__) < Version("1.8.20"):
pytest.skip("SQLite driver only available from version 1.8.20")
# If only 3D Points, geometry_type needs to be specified for spatialite at the
# moment. This if can be removed once the following PR is released:
# https://github.com/geopandas/pyogrio/pull/223
if (
engine == "pyogrio"
and len(geodataframe) == 2
and geodataframe.geometry[0] is None
and geodataframe.geometry[1] is not None
and geodataframe.geometry[1].has_z
):
write_kwargs["geometry_type"] = "Point Z"
expected_error = _expected_error_on(geodataframe, driver)
if expected_error:
with pytest.raises(
RuntimeError, match="Failed to write record|Could not add feature to layer"
):
geodataframe.to_file(
output_file, driver=driver, engine=engine, **write_kwargs
)
else:
if driver == "SQLite" and engine == "pyogrio":
try:
geodataframe.to_file(
output_file, driver=driver, engine=engine, **write_kwargs
)
except ValueError as e:
if "unrecognized option 'SPATIALITE'" in str(e):
pytest.xfail(
"pyogrio wheels from PyPI do not come with SpatiaLite support. "
f"Error: {e}"
)
raise
else:
geodataframe.to_file(
output_file, driver=driver, engine=engine, **write_kwargs
)
reloaded = geopandas.read_file(output_file, engine=engine)
if driver == "GeoJSON" and engine == "pyogrio":
# For GeoJSON files, the int64 column comes back as int32
reloaded["a"] = reloaded["a"].astype("int64")
assert_geodataframe_equal(geodataframe, reloaded, check_column_type="equiv")

View File

@@ -1,537 +0,0 @@
import contextlib
import json
import os
import pathlib
from packaging.version import Version
import numpy as np
import shapely
from shapely import MultiPoint, Point, box
from geopandas import GeoDataFrame, GeoSeries
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
pytest.importorskip("pyarrow")
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow import feather
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
def pa_table(table):
if Version(pa.__version__) < Version("14.0.0"):
return table._pa_table
else:
return pa.table(table)
def pa_array(array):
if Version(pa.__version__) < Version("14.0.0"):
return array._pa_array
else:
return pa.array(array)
def assert_table_equal(left, right, check_metadata=True):
geom_type = left["geometry"].type
# in case of Points (directly the inner fixed_size_list or struct type)
# -> there are NaNs for empties -> we need to compare them separately
# and then fill, because pyarrow.Table.equals considers NaNs as not equal
if pa.types.is_fixed_size_list(geom_type):
left_values = left["geometry"].chunk(0).values
right_values = right["geometry"].chunk(0).values
assert pc.is_nan(left_values).equals(pc.is_nan(right_values))
left_geoms = pa.FixedSizeListArray.from_arrays(
pc.replace_with_mask(left_values, pc.is_nan(left_values), 0.0),
type=left["geometry"].type,
)
right_geoms = pa.FixedSizeListArray.from_arrays(
pc.replace_with_mask(right_values, pc.is_nan(right_values), 0.0),
type=right["geometry"].type,
)
left = left.set_column(1, left.schema.field("geometry"), left_geoms)
right = right.set_column(1, right.schema.field("geometry"), right_geoms)
elif pa.types.is_struct(geom_type):
left_arr = left["geometry"].chunk(0)
right_arr = right["geometry"].chunk(0)
for i in range(left_arr.type.num_fields):
assert pc.is_nan(left_arr.field(i)).equals(pc.is_nan(right_arr.field(i)))
left_geoms = pa.StructArray.from_arrays(
[
pc.replace_with_mask(
left_arr.field(i), pc.is_nan(left_arr.field(i)), 0.0
)
for i in range(left_arr.type.num_fields)
],
fields=list(left["geometry"].type),
)
right_geoms = pa.StructArray.from_arrays(
[
pc.replace_with_mask(
right_arr.field(i), pc.is_nan(right_arr.field(i)), 0.0
)
for i in range(right_arr.type.num_fields)
],
fields=list(right["geometry"].type),
)
left = left.set_column(1, left.schema.field("geometry"), left_geoms)
right = right.set_column(1, right.schema.field("geometry"), right_geoms)
if left.equals(right, check_metadata=check_metadata):
return
if not left.schema.equals(right.schema):
raise AssertionError(
"Schema not equal\nLeft:\n{0}\nRight:\n{1}".format(
left.schema, right.schema
)
)
if check_metadata:
if not left.schema.equals(right.schema, check_metadata=True):
if not left.schema.metadata == right.schema.metadata:
raise AssertionError(
"Metadata not equal\nLeft:\n{0}\nRight:\n{1}".format(
left.schema.metadata, right.schema.metadata
)
)
for col in left.schema.names:
assert left.schema.field(col).equals(
right.schema.field(col), check_metadata=True
)
for col in left.column_names:
a_left = pa.concat_arrays(left.column(col).chunks)
a_right = pa.concat_arrays(right.column(col).chunks)
if not a_left.equals(a_right):
raise AssertionError(
"Column '{0}' not equal:\n{1}".format(col, a_left.diff(a_right))
)
raise AssertionError("Tables not equal for unknown reason")
@pytest.mark.skipif(
shapely.geos_version < (3, 9, 0),
reason="Checking for empty is buggy with GEOS<3.9",
) # an old GEOS is installed in the CI builds with the defaults channel
@pytest.mark.parametrize(
"dim",
[
"xy",
pytest.param(
"xyz",
marks=pytest.mark.skipif(
shapely.geos_version < (3, 10, 0),
reason="Cannot write 3D geometries with GEOS<3.10",
),
),
],
)
@pytest.mark.parametrize(
"geometry_type",
["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
@pytest.mark.parametrize(
"geometry_encoding, interleaved",
[("WKB", None), ("geoarrow", True), ("geoarrow", False)],
ids=["WKB", "geoarrow-interleaved", "geoarrow-separated"],
)
def test_geoarrow_export(geometry_type, dim, geometry_encoding, interleaved):
base_path = DATA_PATH / "geoarrow"
suffix = geometry_type + ("_z" if dim == "xyz" else "")
# Read the example data
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
df["row_number"] = df["row_number"].astype("int32")
df = GeoDataFrame(df)
df.geometry.array.crs = None
# Read the expected data
if geometry_encoding == "WKB":
filename = f"example-{suffix}-wkb.arrow"
else:
filename = f"example-{suffix}{'-interleaved' if interleaved else ''}.arrow"
expected = feather.read_table(base_path / filename)
# GeoDataFrame -> Arrow Table
result = pa_table(
df.to_arrow(geometry_encoding=geometry_encoding, interleaved=interleaved)
)
# remove the "pandas" metadata
result = result.replace_schema_metadata(None)
mask_nonempty = None
if (
geometry_encoding == "WKB"
and dim == "xyz"
and geometry_type.startswith("multi")
):
# for collections with z dimension, drop the empties because those don't
# roundtrip correctly to WKB
# (https://github.com/libgeos/geos/issues/888)
mask_nonempty = pa.array(np.asarray(~df.geometry.is_empty))
result = result.filter(mask_nonempty)
expected = expected.filter(mask_nonempty)
assert_table_equal(result, expected)
# GeoSeries -> Arrow array
if geometry_encoding != "WKB" and geometry_type == "point":
# for points, we again have to handle NaNs separately, we already did that
# for table so let's just skip this part
return
result_arr = pa_array(
df.geometry.to_arrow(
geometry_encoding=geometry_encoding, interleaved=interleaved
)
)
if mask_nonempty is not None:
result_arr = result_arr.filter(mask_nonempty)
assert result_arr.equals(expected["geometry"].chunk(0))
@pytest.mark.skipif(
Version(shapely.__version__) < Version("2.0.2"),
reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_multiple_geometry_crs(encoding):
pytest.importorskip("pyproj")
# ensure each geometry column has its own crs
gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
gdf["geom2"] = gdf.geometry.to_crs("epsg:3857")
result = pa_table(gdf.to_arrow(geometry_encoding=encoding))
meta1 = json.loads(
result.schema.field("geometry").metadata[b"ARROW:extension:metadata"]
)
assert json.loads(meta1["crs"])["id"]["code"] == 4326
meta2 = json.loads(
result.schema.field("geom2").metadata[b"ARROW:extension:metadata"]
)
assert json.loads(meta2["crs"])["id"]["code"] == 3857
roundtripped = GeoDataFrame.from_arrow(result)
assert_geodataframe_equal(gdf, roundtripped)
assert gdf.geometry.crs == "epsg:4326"
assert gdf.geom2.crs == "epsg:3857"
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_series_name_crs(encoding):
pytest.importorskip("pyproj")
pytest.importorskip("pyarrow", minversion="14.0.0")
gser = GeoSeries([box(0, 0, 10, 10)], crs="epsg:4326", name="geom")
schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
field = pa.Field._import_from_c_capsule(schema_capsule)
assert field.name == "geom"
assert field.metadata[b"ARROW:extension:name"] == (
b"geoarrow.wkb" if encoding == "WKB" else b"geoarrow.polygon"
)
meta = json.loads(field.metadata[b"ARROW:extension:metadata"])
assert json.loads(meta["crs"])["id"]["code"] == 4326
# ensure it also works without a name
gser = GeoSeries([box(0, 0, 10, 10)])
schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
field = pa.Field._import_from_c_capsule(schema_capsule)
assert field.name == ""
def test_geoarrow_unsupported_encoding():
gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
with pytest.raises(ValueError, match="Expected geometry encoding"):
gdf.to_arrow(geometry_encoding="invalid")
with pytest.raises(ValueError, match="Expected geometry encoding"):
gdf.geometry.to_arrow(geometry_encoding="invalid")
def test_geoarrow_mixed_geometry_types():
gdf = GeoDataFrame(
{"geometry": [Point(0, 0), box(0, 0, 10, 10)]},
crs="epsg:4326",
)
with pytest.raises(ValueError, match="Geometry type combination is not supported"):
gdf.to_arrow(geometry_encoding="geoarrow")
gdf = GeoDataFrame(
{"geometry": [Point(0, 0), MultiPoint([(0, 0), (1, 1)])]},
crs="epsg:4326",
)
result = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
assert (
result.schema.field("geometry").metadata[b"ARROW:extension:name"]
== b"geoarrow.multipoint"
)
@pytest.mark.parametrize("geom_type", ["point", "polygon"])
@pytest.mark.parametrize(
"encoding, interleaved", [("WKB", True), ("geoarrow", True), ("geoarrow", False)]
)
def test_geoarrow_missing(encoding, interleaved, geom_type):
# dummy test for single geometry type until missing values are included
# in the test data for test_geoarrow_export
gdf = GeoDataFrame(
geometry=[Point(0, 0) if geom_type == "point" else box(0, 0, 10, 10), None],
crs="epsg:4326",
)
if (
encoding == "geoarrow"
and geom_type == "point"
and interleaved
and Version(pa.__version__) < Version("15.0.0")
):
with pytest.raises(
ValueError,
match="Converting point geometries with missing values is not supported",
):
gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved)
return
result = pa_table(gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved))
assert result["geometry"].null_count == 1
assert result["geometry"].is_null().to_pylist() == [False, True]
def test_geoarrow_include_z():
gdf = GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1), Point()]})
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
assert table["geometry"].type.value_field.name == "xy"
assert table["geometry"].type.list_size == 2
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow", include_z=True))
assert table["geometry"].type.value_field.name == "xyz"
assert table["geometry"].type.list_size == 3
assert np.isnan(table["geometry"].chunk(0).values.to_numpy()[2::3]).all()
gdf = GeoDataFrame({"geometry": [Point(0, 0, 0), Point(1, 1, 1), Point()]})
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
assert table["geometry"].type.value_field.name == "xyz"
assert table["geometry"].type.list_size == 3
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow", include_z=False))
assert table["geometry"].type.value_field.name == "xy"
assert table["geometry"].type.list_size == 2
@contextlib.contextmanager
def with_geoarrow_extension_types():
gp = pytest.importorskip("geoarrow.pyarrow")
gp.register_extension_types()
try:
yield
finally:
gp.unregister_extension_types()
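# Usage sketch (assuming geoarrow.pyarrow is installed): the context manager
# registers the geoarrow extension types only for the duration of a block, so
# exported geometry columns are recognized by pyarrow as extension types:
#
#     with with_geoarrow_extension_types():
#         table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
#         assert isinstance(table["geometry"].type, pa.ExtensionType)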
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
"geometry_type",
["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
def test_geoarrow_export_with_extension_types(geometry_type, dim):
# ensure the exported data can be imported by geoarrow-pyarrow and are
# recognized as extension types
base_path = DATA_PATH / "geoarrow"
suffix = geometry_type + ("_z" if dim == "xyz" else "")
# Read the example data
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
df["row_number"] = df["row_number"].astype("int32")
df = GeoDataFrame(df)
df.geometry.array.crs = None
pytest.importorskip("geoarrow.pyarrow")
with with_geoarrow_extension_types():
result1 = pa_table(df.to_arrow(geometry_encoding="WKB"))
assert isinstance(result1["geometry"].type, pa.ExtensionType)
result2 = pa_table(df.to_arrow(geometry_encoding="geoarrow"))
assert isinstance(result2["geometry"].type, pa.ExtensionType)
result3 = pa_table(df.to_arrow(geometry_encoding="geoarrow", interleaved=False))
assert isinstance(result3["geometry"].type, pa.ExtensionType)
@pytest.mark.skipif(
Version(shapely.__version__) < Version("2.0.2"),
reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
"geometry_type",
[
"point",
"linestring",
"polygon",
"multipoint",
"multilinestring",
"multipolygon",
],
)
def test_geoarrow_import(geometry_type, dim):
base_path = DATA_PATH / "geoarrow"
suffix = geometry_type + ("_z" if dim == "xyz" else "")
# Read the example data
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
df = GeoDataFrame(df)
df.geometry.crs = None
table1 = feather.read_table(base_path / f"example-{suffix}-wkb.arrow")
result1 = GeoDataFrame.from_arrow(table1)
assert_geodataframe_equal(result1, df)
table2 = feather.read_table(base_path / f"example-{suffix}-interleaved.arrow")
result2 = GeoDataFrame.from_arrow(table2)
assert_geodataframe_equal(result2, df)
table3 = feather.read_table(base_path / f"example-{suffix}.arrow")
result3 = GeoDataFrame.from_arrow(table3)
assert_geodataframe_equal(result3, df)
@pytest.mark.skipif(
Version(shapely.__version__) < Version("2.0.2"),
reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_import_geometry_column(encoding):
pytest.importorskip("pyproj")
# ensure the active geometry column can be selected on import
gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)])
gdf["centroid"] = gdf.geometry.centroid
result = GeoDataFrame.from_arrow(pa_table(gdf.to_arrow(geometry_encoding=encoding)))
assert_geodataframe_equal(result, gdf)
assert result.active_geometry_name == "geometry"
result = GeoDataFrame.from_arrow(
pa_table(gdf[["centroid"]].to_arrow(geometry_encoding=encoding))
)
assert result.active_geometry_name == "centroid"
result = GeoDataFrame.from_arrow(
pa_table(gdf.to_arrow(geometry_encoding=encoding)), geometry="centroid"
)
assert result.active_geometry_name == "centroid"
assert_geodataframe_equal(result, gdf.set_geometry("centroid"))
def test_geoarrow_import_missing_geometry():
pytest.importorskip("pyarrow", minversion="14.0.0")
table = pa.table({"a": [0, 1, 2], "b": [0.1, 0.2, 0.3]})
with pytest.raises(ValueError, match="No geometry column found"):
GeoDataFrame.from_arrow(table)
with pytest.raises(ValueError, match="No GeoArrow geometry field found"):
GeoSeries.from_arrow(table["a"].chunk(0))
def test_geoarrow_import_capsule_interface():
# ensure we can import non-pyarrow object
pytest.importorskip("pyarrow", minversion="14.0.0")
gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
result = GeoDataFrame.from_arrow(gdf.to_arrow())
assert_geodataframe_equal(result, gdf)
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
"geometry_type",
["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
def test_geoarrow_import_from_extension_types(geometry_type, dim):
# ensure data exported while the geoarrow-pyarrow extension types are
# registered can be imported back into a GeoDataFrame
pytest.importorskip("pyproj")
base_path = DATA_PATH / "geoarrow"
suffix = geometry_type + ("_z" if dim == "xyz" else "")
# Read the example data
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
df = GeoDataFrame(df, crs="EPSG:3857")
pytest.importorskip("geoarrow.pyarrow")
with with_geoarrow_extension_types():
result1 = GeoDataFrame.from_arrow(
pa_table(df.to_arrow(geometry_encoding="WKB"))
)
assert_geodataframe_equal(result1, df)
result2 = GeoDataFrame.from_arrow(
pa_table(df.to_arrow(geometry_encoding="geoarrow"))
)
assert_geodataframe_equal(result2, df)
result3 = GeoDataFrame.from_arrow(
pa_table(df.to_arrow(geometry_encoding="geoarrow", interleaved=False))
)
assert_geodataframe_equal(result3, df)
def test_geoarrow_import_geoseries():
pytest.importorskip("pyproj")
gp = pytest.importorskip("geoarrow.pyarrow")
ser = GeoSeries.from_wkt(["POINT (1 1)", "POINT (2 2)"], crs="EPSG:3857")
with with_geoarrow_extension_types():
arr = gp.array(ser.to_arrow(geometry_encoding="WKB"))
result = GeoSeries.from_arrow(arr)
assert_geoseries_equal(result, ser)
arr = gp.array(ser.to_arrow(geometry_encoding="geoarrow"))
result = GeoSeries.from_arrow(arr)
assert_geoseries_equal(result, ser)
# the name is lost when going through a pyarrow.Array
ser.name = "name"
arr = gp.array(ser.to_arrow())
result = GeoSeries.from_arrow(arr)
assert result.name is None
# we can specify the name as one of the kwargs
result = GeoSeries.from_arrow(arr, name="test")
assert result.name == "test"
def test_geoarrow_import_unknown_geoarrow_type():
gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
table = pa_table(gdf.to_arrow())
schema = table.schema
new_field = schema.field("geometry").with_metadata(
{
b"ARROW:extension:name": b"geoarrow.unknown",
b"ARROW:extension:metadata": b"{}",
}
)
new_schema = pa.schema([schema.field(0), new_field])
new_table = table.cast(new_schema)
with pytest.raises(TypeError, match="Unknown GeoArrow extension type"):
GeoDataFrame.from_arrow(new_table)


@@ -1,306 +0,0 @@
from collections import OrderedDict
import numpy as np
import pandas as pd
from shapely.geometry import (
LineString,
MultiLineString,
MultiPoint,
MultiPolygon,
Point,
Polygon,
)
from geopandas import GeoDataFrame
from geopandas.io.file import infer_schema
import pytest
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
city_hall_boundaries = Polygon(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
)
vauquelin_place = Polygon(
(
(-73.5542465586147, 45.5081555487952),
(-73.5540185061397, 45.5084409343852),
(-73.5546126200639, 45.5086813829106),
(-73.5548825850032, 45.5084033554357),
(-73.5542465586147, 45.5081555487952),
)
)
city_hall_walls = [
LineString(
(
(-73.5541107525234, 45.5091983609661),
(-73.5546126200639, 45.5086813829106),
(-73.5540185061397, 45.5084409343852),
)
),
LineString(
(
(-73.5539986525799, 45.5084323044531),
(-73.5535801792994, 45.5089539203786),
(-73.5541107525234, 45.5091983609661),
)
),
]
city_hall_entrance = Point(-73.553785, 45.508722)
city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)
point_3D = Point(-73.553785, 45.508722, 300)
linestring_3D = LineString(
(
(-73.5541107525234, 45.5091983609661, 300),
(-73.5546126200639, 45.5086813829106, 300),
(-73.5540185061397, 45.5084409343852, 300),
)
)
polygon_3D = Polygon(
(
(-73.5541107525234, 45.5091983609661, 300),
(-73.5535801792994, 45.5089539203786, 300),
(-73.5541107525234, 45.5091983609661, 300),
)
)
def test_infer_schema_only_points():
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
assert infer_schema(df) == {"geometry": "Point", "properties": OrderedDict()}
def test_infer_schema_points_and_multipoints():
df = GeoDataFrame(
geometry=[
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
]
)
assert infer_schema(df) == {
"geometry": ["MultiPoint", "Point"],
"properties": OrderedDict(),
}
def test_infer_schema_only_multipoints():
df = GeoDataFrame(
geometry=[
MultiPoint(
[city_hall_entrance, city_hall_balcony, city_hall_council_chamber]
)
]
)
assert infer_schema(df) == {"geometry": "MultiPoint", "properties": OrderedDict()}
def test_infer_schema_only_linestrings():
df = GeoDataFrame(geometry=city_hall_walls)
assert infer_schema(df) == {"geometry": "LineString", "properties": OrderedDict()}
def test_infer_schema_linestrings_and_multilinestrings():
df = GeoDataFrame(geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]])
assert infer_schema(df) == {
"geometry": ["MultiLineString", "LineString"],
"properties": OrderedDict(),
}
def test_infer_schema_only_multilinestrings():
df = GeoDataFrame(geometry=[MultiLineString(city_hall_walls)])
assert infer_schema(df) == {
"geometry": "MultiLineString",
"properties": OrderedDict(),
}
def test_infer_schema_only_polygons():
df = GeoDataFrame(geometry=[city_hall_boundaries, vauquelin_place])
assert infer_schema(df) == {"geometry": "Polygon", "properties": OrderedDict()}
def test_infer_schema_polygons_and_multipolygons():
df = GeoDataFrame(
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
]
)
assert infer_schema(df) == {
"geometry": ["MultiPolygon", "Polygon"],
"properties": OrderedDict(),
}
def test_infer_schema_only_multipolygons():
df = GeoDataFrame(geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))])
assert infer_schema(df) == {"geometry": "MultiPolygon", "properties": OrderedDict()}
def test_infer_schema_multiple_shape_types():
df = GeoDataFrame(
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
]
)
assert infer_schema(df) == {
"geometry": [
"MultiPolygon",
"Polygon",
"MultiLineString",
"LineString",
"MultiPoint",
"Point",
],
"properties": OrderedDict(),
}
def test_infer_schema_mixed_3D_shape_type():
df = GeoDataFrame(
geometry=[
MultiPolygon((city_hall_boundaries, vauquelin_place)),
city_hall_boundaries,
MultiLineString(city_hall_walls),
city_hall_walls[0],
MultiPoint([city_hall_entrance, city_hall_balcony]),
city_hall_balcony,
point_3D,
]
)
assert infer_schema(df) == {
"geometry": [
"3D Point",
"MultiPolygon",
"Polygon",
"MultiLineString",
"LineString",
"MultiPoint",
"Point",
],
"properties": OrderedDict(),
}
def test_infer_schema_mixed_3D_Point():
df = GeoDataFrame(geometry=[city_hall_balcony, point_3D])
assert infer_schema(df) == {
"geometry": ["3D Point", "Point"],
"properties": OrderedDict(),
}
def test_infer_schema_only_3D_Points():
df = GeoDataFrame(geometry=[point_3D, point_3D])
assert infer_schema(df) == {"geometry": "3D Point", "properties": OrderedDict()}
def test_infer_schema_mixed_3D_linestring():
df = GeoDataFrame(geometry=[city_hall_walls[0], linestring_3D])
assert infer_schema(df) == {
"geometry": ["3D LineString", "LineString"],
"properties": OrderedDict(),
}
def test_infer_schema_only_3D_linestrings():
df = GeoDataFrame(geometry=[linestring_3D, linestring_3D])
assert infer_schema(df) == {
"geometry": "3D LineString",
"properties": OrderedDict(),
}
def test_infer_schema_mixed_3D_Polygon():
df = GeoDataFrame(geometry=[city_hall_boundaries, polygon_3D])
assert infer_schema(df) == {
"geometry": ["3D Polygon", "Polygon"],
"properties": OrderedDict(),
}
def test_infer_schema_only_3D_Polygons():
df = GeoDataFrame(geometry=[polygon_3D, polygon_3D])
assert infer_schema(df) == {"geometry": "3D Polygon", "properties": OrderedDict()}
def test_infer_schema_null_geometry_and_2D_point():
df = GeoDataFrame(geometry=[None, city_hall_entrance])
# None geometry type is then omitted
assert infer_schema(df) == {"geometry": "Point", "properties": OrderedDict()}
def test_infer_schema_null_geometry_and_3D_point():
df = GeoDataFrame(geometry=[None, point_3D])
# None geometry type is then omitted
assert infer_schema(df) == {"geometry": "3D Point", "properties": OrderedDict()}
def test_infer_schema_null_geometry_all():
df = GeoDataFrame(geometry=[None, None])
# None geometry type is then replaced by 'Unknown'
# (default geometry type supported by Fiona)
assert infer_schema(df) == {"geometry": "Unknown", "properties": OrderedDict()}
@pytest.mark.parametrize(
"array_data,dtype", [([1, 2**31 - 1], np.int32), ([1, np.nan], pd.Int32Dtype())]
)
def test_infer_schema_int32(array_data, dtype):
int32col = pd.array(data=array_data, dtype=dtype)
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
df["int32_column"] = int32col
assert infer_schema(df) == {
"geometry": "Point",
"properties": OrderedDict([("int32_column", "int32")]),
}
def test_infer_schema_int64():
int64col = pd.array([1, np.nan], dtype=pd.Int64Dtype())
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
df["int64_column"] = int64col
assert infer_schema(df) == {
"geometry": "Point",
"properties": OrderedDict([("int64_column", "int")]),
}


@@ -1,56 +0,0 @@
"""
See generate_legacy_storage_files.py for the creation of the legacy files.
"""
import glob
import os
import pathlib
import pandas as pd
import pytest
from geopandas.testing import assert_geodataframe_equal
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
@pytest.fixture(scope="module")
def current_pickle_data():
# our current version pickle data
from .generate_legacy_storage_files import create_pickle_data
return create_pickle_data()
files = glob.glob(str(DATA_PATH / "pickle" / "*.pickle"))
@pytest.fixture(params=files, ids=[p.split("/")[-1] for p in files])
def legacy_pickle(request):
return request.param
@pytest.mark.skip(
reason=(
"shapely 2.0/pygeos-based unpickling currently only works for "
"shapely-2.0/pygeos-written files"
),
)
def test_legacy_pickles(current_pickle_data, legacy_pickle):
result = pd.read_pickle(legacy_pickle)
for name, value in result.items():
expected = current_pickle_data[name]
assert_geodataframe_equal(value, expected)
def test_round_trip_current(tmpdir, current_pickle_data):
data = current_pickle_data
for name, value in data.items():
path = str(tmpdir / "{}.pickle".format(name))
value.to_pickle(path)
result = pd.read_pickle(path)
assert_geodataframe_equal(result, value)
assert isinstance(result.has_sindex, bool)


@@ -1,878 +0,0 @@
"""
Tests here include reading/writing to different types of spatial databases.
The spatial database tests may not work without additional system
configuration. PostGIS tests require a test database to have been set up;
see geopandas.tests.util for more information.
"""
import os
import warnings
from importlib.util import find_spec
import pandas as pd
import geopandas
import geopandas._compat as compat
from geopandas import GeoDataFrame, read_file, read_postgis
from geopandas._compat import HAS_PYPROJ
from geopandas.io.sql import _get_conn as get_conn
from geopandas.io.sql import _write_postgis as write_postgis
import pytest
from geopandas.tests.util import (
create_postgis,
create_spatialite,
mock,
validate_boro_df,
)
try:
from sqlalchemy import text
except ImportError:
# Avoid local imports for text in all sqlalchemy tests
# all tests using text use engine_postgis, which ensures sqlalchemy is available
text = str
@pytest.fixture
def df_nybb(nybb_filename):
df = read_file(nybb_filename)
return df
def check_available_postgis_drivers() -> list[str]:
"""Work out which of psycopg2 and psycopg are available.
This prevents tests from being generated if the relevant package isn't
installed (rather than skipped at runtime, as skips are treated as failures
during PostGIS CI).
"""
drivers = []
if find_spec("psycopg"):
drivers.append("psycopg")
if find_spec("psycopg2"):
drivers.append("psycopg2")
return drivers
POSTGIS_DRIVERS = check_available_postgis_drivers()
def prepare_database_credentials() -> dict:
"""Gather postgres connection credentials from environment variables."""
return {
"dbname": "test_geopandas",
"user": os.environ.get("PGUSER"),
"password": os.environ.get("PGPASSWORD"),
"host": os.environ.get("PGHOST"),
"port": os.environ.get("PGPORT"),
}
@pytest.fixture()
def connection_postgis(request):
"""Create a postgres connection using either psycopg2 or psycopg.
Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS."""
psycopg = pytest.importorskip(request.param)
try:
con = psycopg.connect(**prepare_database_credentials())
except psycopg.OperationalError:
pytest.skip("Cannot connect with postgresql database")
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", message="pandas only supports SQLAlchemy connectable.*"
)
yield con
con.close()
@pytest.fixture()
def engine_postgis(request):
"""
Initiate a sqlalchemy connection engine using either psycopg2 or psycopg.
Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS.
"""
sqlalchemy = pytest.importorskip("sqlalchemy")
from sqlalchemy.engine.url import URL
credentials = prepare_database_credentials()
try:
con = sqlalchemy.create_engine(
URL.create(
drivername=f"postgresql+{request.param}",
username=credentials["user"],
database=credentials["dbname"],
password=credentials["password"],
host=credentials["host"],
port=credentials["port"],
)
)
con.connect()
except Exception:
pytest.skip("Cannot connect with postgresql database")
yield con
con.dispose()
@pytest.fixture()
def connection_spatialite():
"""
Return a memory-based SQLite3 connection with SpatiaLite enabled & initialized.
`The sqlite3 module must be built with loadable extension support
<https://docs.python.org/3/library/sqlite3.html#f1>`_ and
`SpatiaLite <https://www.gaia-gis.it/fossil/libspatialite/index>`_
must be available on the system as a SQLite module.
Packages available on Anaconda meet requirements.
Exceptions
----------
``AttributeError`` on missing support for loadable SQLite extensions
``sqlite3.OperationalError`` on missing SpatiaLite
"""
sqlite3 = pytest.importorskip("sqlite3")
try:
with sqlite3.connect(":memory:") as con:
con.enable_load_extension(True)
con.load_extension("mod_spatialite")
con.execute("SELECT InitSpatialMetaData(TRUE)")
except Exception:
con.close()
pytest.skip("Cannot setup spatialite database")
yield con
con.close()
def drop_table_if_exists(conn_or_engine, table):
sqlalchemy = pytest.importorskip("sqlalchemy")
if sqlalchemy.inspect(conn_or_engine).has_table(table):
metadata = sqlalchemy.MetaData()
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", message="Did not recognize type 'geometry' of column.*"
)
metadata.reflect(conn_or_engine)
table = metadata.tables.get(table)
if table is not None:
table.drop(conn_or_engine, checkfirst=True)
@pytest.fixture
def df_mixed_single_and_multi():
from shapely.geometry import LineString, MultiLineString, Point
df = geopandas.GeoDataFrame(
{
"geometry": [
LineString([(0, 0), (1, 1)]),
MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
Point(0, 1),
]
},
crs="epsg:4326",
)
return df
@pytest.fixture
def df_geom_collection():
from shapely.geometry import GeometryCollection, LineString, Point, Polygon
df = geopandas.GeoDataFrame(
{
"geometry": [
GeometryCollection(
[
Polygon([(0, 0), (1, 1), (0, 1)]),
LineString([(0, 0), (1, 1)]),
Point(0, 0),
]
)
]
},
crs="epsg:4326",
)
return df
@pytest.fixture
def df_linear_ring():
from shapely.geometry import LinearRing
df = geopandas.GeoDataFrame(
{"geometry": [LinearRing(((0, 0), (0, 1), (1, 1), (1, 0)))]}, crs="epsg:4326"
)
return df
@pytest.fixture
def df_3D_geoms():
from shapely.geometry import LineString, Point, Polygon
df = geopandas.GeoDataFrame(
{
"geometry": [
LineString([(0, 0, 0), (1, 1, 1)]),
Polygon([(0, 0, 0), (1, 1, 1), (0, 1, 1)]),
Point(0, 1, 2),
]
},
crs="epsg:4326",
)
return df
class TestIO:
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_get_conn(self, engine_postgis):
Connection = pytest.importorskip("sqlalchemy.engine.base").Connection
engine = engine_postgis
with get_conn(engine) as output:
assert isinstance(output, Connection)
with engine.connect() as conn:
with get_conn(conn) as output:
assert isinstance(output, Connection)
with pytest.raises(ValueError):
with get_conn(object()):
pass
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_default(self, connection_postgis, df_nybb):
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con)
validate_boro_df(df)
# no crs defined on the created table, and none specified by the user;
# the crs should stay None rather than be derived from a get_srid failure
assert df.crs is None
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_custom_geom_col(self, connection_postgis, df_nybb):
con = connection_postgis
geom_col = "the_geom"
create_postgis(con, df_nybb, geom_col=geom_col)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_select_geom_as(self, connection_postgis, df_nybb):
"""Tests that a SELECT {geom} AS {some_other_geom} works."""
con = connection_postgis
orig_geom = "geom"
out_geom = "the_geom"
create_postgis(con, df_nybb, geom_col=orig_geom)
sql = """SELECT borocode, boroname, shape_leng, shape_area,
{} as {} FROM nybb;""".format(
orig_geom, out_geom
)
df = read_postgis(sql, con, geom_col=out_geom)
validate_boro_df(df)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_get_srid(self, connection_postgis, df_nybb):
"""Tests that an SRID can be read from a geodatabase (GH #451)."""
con = connection_postgis
crs = "epsg:4269"
df_reproj = df_nybb.to_crs(crs)
create_postgis(con, df_reproj, srid=4269)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con)
validate_boro_df(df)
assert df.crs == crs
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_override_srid(self, connection_postgis, df_nybb):
"""Tests that a user specified CRS overrides the geodatabase SRID."""
con = connection_postgis
orig_crs = df_nybb.crs
create_postgis(con, df_nybb, srid=4269)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con, crs=orig_crs)
validate_boro_df(df)
assert df.crs == orig_crs
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_from_postgis_default(self, connection_postgis, df_nybb):
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
df = GeoDataFrame.from_postgis(sql, con)
validate_boro_df(df, case_sensitive=False)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_from_postgis_custom_geom_col(self, connection_postgis, df_nybb):
con = connection_postgis
geom_col = "the_geom"
create_postgis(con, df_nybb, geom_col=geom_col)
sql = "SELECT * FROM nybb;"
df = GeoDataFrame.from_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df, case_sensitive=False)
def test_read_postgis_null_geom(self, connection_spatialite, df_nybb):
"""Tests that geometry with NULL is accepted."""
con = connection_spatialite
geom_col = df_nybb.geometry.name
df_nybb.geometry.iat[0] = None
create_spatialite(con, df_nybb)
sql = (
"SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
'AsEWKB("{0}") AS "{0}" FROM nybb'.format(geom_col)
)
df = read_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df)
def test_read_postgis_binary(self, connection_spatialite, df_nybb):
"""Tests that geometry read as binary is accepted."""
con = connection_spatialite
geom_col = df_nybb.geometry.name
create_spatialite(con, df_nybb)
sql = (
"SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
'ST_AsBinary("{0}") AS "{0}" FROM nybb'.format(geom_col)
)
df = read_postgis(sql, con, geom_col=geom_col)
validate_boro_df(df)
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_postgis_chunksize(self, connection_postgis, df_nybb):
"""Test chunksize argument"""
chunksize = 2
con = connection_postgis
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
validate_boro_df(df)
# no crs defined on the created table, and none specified by the user;
# the crs should stay None rather than be derived from a get_srid failure
assert df.crs is None
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_default(self, engine_postgis, df_nybb):
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
engine = engine_postgis
table = "nybb"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(engine, table)
# Write to db
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_uppercase_tablename(self, engine_postgis, df_nybb):
"""Tests writing GeoDataFrame to PostGIS with uppercase tablename."""
engine = engine_postgis
table = "aTestTable"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(engine, table)
# Write to db
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
# Validate
sql = text('SELECT * FROM "{table}";'.format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_sqlalchemy_connection(self, engine_postgis, df_nybb):
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
with engine_postgis.begin() as con:
table = "nybb_con"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(con, table)
# Write to db
write_postgis(df_nybb, con=con, name=table, if_exists="fail")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, con, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_fail_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that uploading to an existing table raises an error when if_exists='fail'.
"""
engine = engine_postgis
table = "nybb"
# Ensure table exists
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Trying again with if_exists="fail" should raise
with pytest.raises(ValueError, match="already exists"):
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_replace_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that replacing a table is possible when if_exists='replace'.
"""
engine = engine_postgis
table = "nybb"
# Ensure table exists
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Overwrite
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_append_when_table_exists(self, engine_postgis, df_nybb):
"""
Tests that appending to an existing table produces correct results when
if_exists='append'.
"""
engine = engine_postgis
table = "nybb"
orig_rows, orig_cols = df_nybb.shape
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
write_postgis(df_nybb, con=engine, name=table, if_exists="append")
# Validate
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
new_rows, new_cols = df.shape
# There should be twice as many rows in the new table
assert new_rows == orig_rows * 2, (
"There should be {target} rows, found: {current}".format(
target=orig_rows * 2, current=new_rows
)
)
# Number of columns should stay the same
assert new_cols == orig_cols, (
"There should be {target} columns, found: {current}".format(
target=orig_cols, current=new_cols
)
)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_without_crs(self, engine_postgis, df_nybb):
"""
Tests that GeoDataFrame can be written to PostGIS without CRS information.
"""
engine = engine_postgis
table = "nybb"
# Write to db
df_nybb.geometry.array.crs = None
with pytest.warns(UserWarning, match="Could not parse CRS from the GeoDataF"):
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Validate that srid is 0
sql = text(
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
schema="public", table=table, geom_col="geometry"
)
)
with engine.connect() as conn:
target_srid = conn.execute(sql).fetchone()[0]
assert target_srid == 0, "SRID should be 0, found %s" % target_srid
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_with_esri_authority(self, engine_postgis, df_nybb):
"""
Tests that GeoDataFrame can be written to PostGIS with ESRI Authority
CRS information (GH #2414).
"""
engine = engine_postgis
table = "nybb"
# Write to db
df_nybb_esri = df_nybb.to_crs("ESRI:102003")
write_postgis(df_nybb_esri, con=engine, name=table, if_exists="replace")
# Validate that srid is 102003
sql = text(
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
schema="public", table=table, geom_col="geometry"
)
)
with engine.connect() as conn:
target_srid = conn.execute(sql).fetchone()[0]
assert target_srid == 102003, "SRID should be 102003, found %s" % target_srid
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_geometry_collection(
self, engine_postgis, df_geom_collection
):
"""
Tests that writing a mix of different geometry types is possible.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(df_geom_collection, con=engine, name=table, if_exists="replace")
# Validate geometry type
sql = text(
"SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
geom_type = conn.execute(sql).fetchone()[0]
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
assert geom_type.upper() == "GEOMETRYCOLLECTION"
assert df.geom_type.unique()[0] == "GeometryCollection"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_mixed_geometry_types(
self, engine_postgis, df_mixed_single_and_multi
):
"""
Tests that writing a mix of single and MultiGeometries is possible.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(
df_mixed_single_and_multi, con=engine, name=table, if_exists="replace"
)
# Validate geometry type
sql = text(
"SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
res = conn.execute(sql).fetchall()
assert res[0][0].upper() == "LINESTRING"
assert res[1][0].upper() == "MULTILINESTRING"
assert res[2][0].upper() == "POINT"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_linear_ring(self, engine_postgis, df_linear_ring):
"""
Tests that writing a LinearRing works.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(df_linear_ring, con=engine, name=table, if_exists="replace")
# Validate geometry type
sql = text(
"SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
geom_type = conn.execute(sql).fetchone()[0]
assert geom_type.upper() == "LINESTRING"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_in_chunks(self, engine_postgis, df_mixed_single_and_multi):
"""
Tests that writing data in chunks works.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(
df_mixed_single_and_multi,
con=engine,
name=table,
if_exists="replace",
chunksize=1,
)
# Validate row count
sql = text("SELECT COUNT(geometry) FROM {table};".format(table=table))
with engine.connect() as conn:
row_cnt = conn.execute(sql).fetchone()[0]
assert row_cnt == 3
# Validate geometry type
sql = text(
"SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
table=table
)
)
with engine.connect() as conn:
res = conn.execute(sql).fetchall()
assert res[0][0].upper() == "LINESTRING"
assert res[1][0].upper() == "MULTILINESTRING"
assert res[2][0].upper() == "POINT"
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_to_different_schema(self, engine_postgis, df_nybb):
"""
Tests writing data to alternative schema.
"""
engine = engine_postgis
table = "nybb"
schema_to_use = "test"
sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
with engine.begin() as conn:
conn.execute(sql)
write_postgis(
df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
)
# Validate
sql = text(
"SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
)
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_to_different_schema_when_table_exists(
self, engine_postgis, df_nybb
):
"""
Tests writing data to an alternative schema when the table already exists.
"""
engine = engine_postgis
table = "nybb"
schema_to_use = "test"
sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
with engine.begin() as conn:
conn.execute(sql)
try:
write_postgis(
df_nybb, con=engine, name=table, if_exists="fail", schema=schema_to_use
)
# Validate
sql = text(
"SELECT * FROM {schema}.{table};".format(
schema=schema_to_use, table=table
)
)
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
# Should raise a ValueError when table exists
except ValueError:
pass
# Try with replace flag on
write_postgis(
df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
)
# Validate
sql = text(
"SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
)
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_write_postgis_3D_geometries(self, engine_postgis, df_3D_geoms):
"""
Tests that writing geometries with 3 dimensions works.
"""
engine = engine_postgis
table = "geomtype_tests"
write_postgis(df_3D_geoms, con=engine, name=table, if_exists="replace")
# Check that all geometries have 3 dimensions
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
assert list(df.geometry.has_z) == [True, True, True]
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_row_order(self, engine_postgis, df_nybb):
"""
Tests that the row order in db table follows the order of the original frame.
"""
engine = engine_postgis
table = "row_order_test"
correct_order = df_nybb["BoroCode"].tolist()
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Check that the row order matches
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
assert df["BoroCode"].tolist() == correct_order
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_before_table_exists(self, engine_postgis, df_nybb):
"""
Tests that insert works with if_exists='append' when table does not exist yet.
"""
engine = engine_postgis
table = "nybb"
# If table exists, delete it before trying to write with defaults
drop_table_if_exists(engine, table)
write_postgis(df_nybb, con=engine, name=table, if_exists="append")
# Check that the row order matches
sql = text("SELECT * FROM {table};".format(table=table))
df = read_postgis(sql, engine, geom_col="geometry")
validate_boro_df(df)
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_with_different_crs(self, engine_postgis, df_nybb):
"""
Tests that an error is raised if the table CRS differs from the frame's.
"""
engine = engine_postgis
table = "nybb"
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# Reproject
df_nybb2 = df_nybb.to_crs(epsg=4326)
# Should raise error when appending
with pytest.raises(ValueError, match="CRS of the target table"):
write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
def test_append_without_crs(self, engine_postgis, df_nybb):
# This test was included in #3328 when the default value for no
# CRS was changed from an SRID of -1 to 0. This resolves issues
# with appending dataframes that have no CRS to PostGIS, as the
# PostGIS value for "no CRS" is 0.
engine = engine_postgis
df_nybb = df_nybb.set_crs(None, allow_override=True)
table = "nybb"
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
# append another dataframe with no crs
df_nybb2 = df_nybb
write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
@pytest.mark.xfail(
compat.PANDAS_GE_20 and not compat.PANDAS_GE_202,
reason="Duplicate columns are dropped in read_sql with pandas 2.0.0 and 2.0.1",
)
def test_duplicate_geometry_column_fails(self, engine_postgis):
"""
Tests that a ValueError is raised if an SQL query returns two geometry columns.
"""
engine = engine_postgis
sql = "select ST_MakePoint(0, 0) as geom, ST_MakePoint(0, 0) as geom;"
with pytest.raises(ValueError):
read_postgis(sql, engine, geom_col="geom")
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_non_epsg_crs(self, connection_postgis, df_nybb):
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="esri:54052")
create_postgis(con, df_nybb, srid=54052)
sql = "SELECT * FROM nybb;"
df = read_postgis(sql, con)
validate_boro_df(df)
assert df.crs == "ESRI:54052"
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
@mock.patch("shapely.get_srid")
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_srid_not_in_table(self, mock_get_srid, connection_postgis, df_nybb):
# mock a non-existent srid for edge case if shapely has an srid
# not present in postgis table.
pyproj = pytest.importorskip("pyproj")
mock_get_srid.return_value = 99999
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="epsg:4326")
create_postgis(con, df_nybb)
sql = "SELECT * FROM nybb;"
with pytest.raises(pyproj.exceptions.CRSError, match="crs not found"):
with pytest.warns(UserWarning, match="Could not find srid 99999"):
read_postgis(sql, con)
@mock.patch("geopandas.io.sql._get_spatial_ref_sys_df")
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_no_spatial_ref_sys_table_in_postgis(
self, mock_get_spatial_ref_sys_df, connection_postgis, df_nybb
):
# mock for a non-existent spatial_ref_sys database
mock_get_spatial_ref_sys_df.side_effect = pd.errors.DatabaseError
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="epsg:4326")
create_postgis(con, df_nybb, srid=4326)
sql = "SELECT * FROM nybb;"
with pytest.warns(
UserWarning, match="Could not find the spatial reference system table"
):
df = read_postgis(sql, con)
assert df.crs == "EPSG:4326"
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
def test_read_non_epsg_crs_chunksize(self, connection_postgis, df_nybb):
"""Test chunksize argument with non epsg crs"""
chunksize = 2
con = connection_postgis
df_nybb = df_nybb.to_crs(crs="esri:54052")
create_postgis(con, df_nybb, srid=54052)
sql = "SELECT * FROM nybb;"
df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
validate_boro_df(df)
assert df.crs == "ESRI:54052"


@@ -1,118 +0,0 @@
"""Vendored, cut down version of pyogrio/util.py for use with fiona"""
import re
import sys
from urllib.parse import urlparse
def vsi_path(path: str) -> str:
"""
Ensure path is a local path or a GDAL-compatible vsi path.
"""
# path is already in GDAL format
if path.startswith("/vsi"):
return path
# Windows drive letters (e.g. "C:\") confuse `urlparse` as they look like
# URL schemes
if sys.platform == "win32" and re.match("^[a-zA-Z]\\:", path):
if not path.split("!")[0].endswith(".zip"):
return path
# prefix with the zip scheme, then proceed with the remaining parsing
path = f"zip://{path}"
path, archive, scheme = _parse_uri(path)
if scheme or archive or path.endswith(".zip"):
return _construct_vsi_path(path, archive, scheme)
return path
# Supported URI schemes and their mapping to GDAL's VSI suffix.
SCHEMES = {
"file": "file",
"zip": "zip",
"tar": "tar",
"gzip": "gzip",
"http": "curl",
"https": "curl",
"ftp": "curl",
"s3": "s3",
"gs": "gs",
"az": "az",
"adls": "adls",
"adl": "adls", # fsspec uses this
"hdfs": "hdfs",
"webhdfs": "webhdfs",
# GDAL additionally supports oss and swift for remote filesystems, but
# those are for now not added as supported URI
}
CURLSCHEMES = {k for k, v in SCHEMES.items() if v == "curl"}
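# With the mapping above, CURLSCHEMES resolves to {"http", "https", "ftp"},
# i.e. the schemes GDAL accesses through its /vsicurl/ handler.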
def _parse_uri(path: str):
"""
Parse a URI.
Returns a tuple of (path, archive, scheme):
path : str
Parsed path. Includes the hostname and query string in the case
of a URI.
archive : str
Parsed archive path.
scheme : str
URI scheme such as "https" or "zip+s3".
"""
parts = urlparse(path, allow_fragments=False)
# if the scheme is not one of GDAL's supported schemes, return raw path
if parts.scheme and not all(p in SCHEMES for p in parts.scheme.split("+")):
return path, "", ""
# we have a URI
path = parts.path
scheme = parts.scheme or ""
if parts.query:
path += "?" + parts.query
if parts.scheme and parts.netloc:
path = parts.netloc + path
parts = path.split("!")
path = parts.pop() if parts else ""
archive = parts.pop() if parts else ""
return (path, archive, scheme)
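# Worked examples (a sketch; the outputs follow from urlparse plus the "!"
# archive split above):
#
#     _parse_uri("https://example.com/data.geojson")
#     -> ("example.com/data.geojson", "", "https")
#
#     _parse_uri("zip+s3://bucket/archive.zip!layer.shp")
#     -> ("layer.shp", "bucket/archive.zip", "zip+s3")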
def _construct_vsi_path(path, archive, scheme) -> str:
"""Convert a parsed path to a GDAL VSI path"""
prefix = ""
suffix = ""
schemes = scheme.split("+")
if "zip" not in schemes and (archive.endswith(".zip") or path.endswith(".zip")):
schemes.insert(0, "zip")
if schemes:
prefix = "/".join(
"vsi{0}".format(SCHEMES[p]) for p in schemes if p and p != "file"
)
if schemes[-1] in CURLSCHEMES:
suffix = f"{schemes[-1]}://"
if prefix:
if archive:
return "/{}/{}{}/{}".format(prefix, suffix, archive, path.lstrip("/"))
else:
return "/{}/{}{}".format(prefix, suffix, path)
return path
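# End-to-end sketch of vsi_path() built on the two helpers above:
#
#     vsi_path("https://example.com/data.geojson")
#     -> "/vsicurl/https://example.com/data.geojson"
#
#     vsi_path("zip+s3://bucket/archive.zip!layer.shp")
#     -> "/vsizip/vsis3/bucket/archive.zip/layer.shp"
#
#     vsi_path("data/archive.zip")  # bare .zip paths gain the zip handler
#     -> "/vsizip/data/archive.zip"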


@@ -1,977 +0,0 @@
import warnings
from packaging.version import Version
import numpy as np
import pandas as pd
from pandas import CategoricalDtype
from pandas.plotting import PlotAccessor
import geopandas
from ._decorator import doc
def _sanitize_geoms(geoms, prefix="Multi"):
"""
Returns Series like geoms and index, except that any Multi geometries
are split into their components and indices are repeated for all components
in the same Multi geometry. At the same time, empty or missing geometries are
filtered out. Maintains 1:1 matching of geometry to value.
Prefix specifies the type of geometry to be flattened: 'Multi' for MultiPoint
and similar, 'Geom' for GeometryCollection.
Returns
-------
components : list of geometry
component_index : index array
indices are repeated for all components in the same Multi geometry
"""
# TODO(shapely) look into simplifying this with
# shapely.get_parts(geoms, return_index=True) from shapely 2.0
components, component_index = [], []
if (
not geoms.geom_type.str.startswith(prefix).any()
and not geoms.is_empty.any()
and not geoms.isna().any()
):
return geoms, np.arange(len(geoms))
for ix, geom in enumerate(geoms):
if geom is not None and geom.geom_type.startswith(prefix) and not geom.is_empty:
for poly in geom.geoms:
components.append(poly)
component_index.append(ix)
elif geom is None or geom.is_empty:
continue
else:
components.append(geom)
component_index.append(ix)
return components, np.array(component_index)
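# Sketch of the flattening behavior (assuming a GeoSeries of shapely
# geometries as input):
#
#     geoms = geopandas.GeoSeries([MultiPoint([(0, 0), (1, 1)]), Point(2, 2)])
#     _sanitize_geoms(geoms)
#     -> ([Point(0, 0), Point(1, 1), Point(2, 2)], array([0, 0, 1]))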
def _expand_kwargs(kwargs, multiindex):
"""
Most arguments to the plot functions must be a (single) value, or a sequence
of values. This function checks each key-value pair in 'kwargs' and expands
it (in place) to the correct length/formats with help of 'multiindex', unless
the value appears to already be a valid (single) value for the key.
"""
from typing import Iterable
from matplotlib.colors import is_color_like
scalar_kwargs = ["marker", "path_effects"]
for att, value in kwargs.items():
if "color" in att: # color(s), edgecolor(s), facecolor(s)
if is_color_like(value):
continue
elif "linestyle" in att: # linestyle(s)
# A single linestyle can be 2-tuple of a number and an iterable.
if (
isinstance(value, tuple)
and len(value) == 2
and isinstance(value[1], Iterable)
):
continue
elif att in scalar_kwargs:
# For these attributes, only a single value is allowed, so never expand.
continue
if pd.api.types.is_list_like(value):
kwargs[att] = np.take(value, multiindex, axis=0)
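# Sketch: with multiindex = [0, 0, 1] (the first geometry was split into two
# components), a list-like per-geometry kwarg is repeated per component:
#
#     kwargs = {"edgecolor": ["red", "green"]}
#     _expand_kwargs(kwargs, np.array([0, 0, 1]))
#     kwargs  # -> {"edgecolor": array(["red", "red", "green"], ...)}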
def _PolygonPatch(polygon, **kwargs):
"""Constructs a matplotlib patch from a Polygon geometry
The `kwargs` are those supported by the matplotlib.patches.PathPatch class
constructor. Returns an instance of matplotlib.patches.PathPatch.
Example (using Shapely Point and a matplotlib axes)::
b = shapely.geometry.Point(0, 0).buffer(1.0)
patch = _PolygonPatch(b, fc='blue', ec='blue', alpha=0.5)
ax.add_patch(patch)
GeoPandas originally relied on the descartes package by Sean Gillies
(BSD license, https://pypi.org/project/descartes) for PolygonPatch, but
this dependency was removed in favor of the below matplotlib code.
"""
from matplotlib.patches import PathPatch
from matplotlib.path import Path
path = Path.make_compound_path(
Path(np.asarray(polygon.exterior.coords)[:, :2]),
*[Path(np.asarray(ring.coords)[:, :2]) for ring in polygon.interiors],
)
return PathPatch(path, **kwargs)
def _plot_polygon_collection(
ax,
geoms,
values=None,
color=None,
cmap=None,
vmin=None,
vmax=None,
autolim=True,
**kwargs,
):
"""
Plots a collection of Polygon and MultiPolygon geometries to `ax`
Parameters
----------
ax : matplotlib.axes.Axes
where shapes will be plotted
geoms : a sequence of `N` Polygons and/or MultiPolygons (can be mixed)
values : a sequence of `N` values, optional
Values will be mapped to colors using vmin/vmax/cmap. They should
have 1:1 correspondence with the geometries (not their components).
Otherwise follows `color` / `facecolor` kwargs.
edgecolor : single color or sequence of `N` colors
Color for the edge of the polygons
facecolor : single color or sequence of `N` colors
Color to fill the polygons. Cannot be used together with `values`.
color : single color or sequence of `N` colors
Sets both `edgecolor` and `facecolor`
autolim : bool (default True)
Update axes data limits to contain the new geometries.
**kwargs
Additional keyword arguments passed to the collection
Returns
-------
collection : matplotlib.collections.Collection that was plotted
"""
from matplotlib.collections import PatchCollection
geoms, multiindex = _sanitize_geoms(geoms)
if values is not None:
values = np.take(values, multiindex, axis=0)
# PatchCollection does not accept some kwargs.
kwargs = {
att: value
for att, value in kwargs.items()
if att not in ["markersize", "marker"]
}
# Add to kwargs for easier checking below.
if color is not None:
kwargs["color"] = color
_expand_kwargs(kwargs, multiindex)
collection = PatchCollection([_PolygonPatch(poly) for poly in geoms], **kwargs)
if values is not None:
collection.set_array(np.asarray(values))
collection.set_cmap(cmap)
if "norm" not in kwargs:
collection.set_clim(vmin, vmax)
ax.add_collection(collection, autolim=autolim)
ax.autoscale_view()
return collection
def _plot_linestring_collection(
ax,
geoms,
values=None,
color=None,
cmap=None,
vmin=None,
vmax=None,
autolim=True,
**kwargs,
):
"""
Plots a collection of LineString and MultiLineString geometries to `ax`
Parameters
----------
ax : matplotlib.axes.Axes
where shapes will be plotted
geoms : a sequence of `N` LineStrings and/or MultiLineStrings (can be
mixed)
values : a sequence of `N` values, optional
Values will be mapped to colors using vmin/vmax/cmap. They should
have 1:1 correspondence with the geometries (not their components).
color : single color or sequence of `N` colors
Cannot be used together with `values`.
autolim : bool (default True)
Update axes data limits to contain the new geometries.
Returns
-------
collection : matplotlib.collections.Collection that was plotted
"""
from matplotlib.collections import LineCollection
geoms, multiindex = _sanitize_geoms(geoms)
if values is not None:
values = np.take(values, multiindex, axis=0)
# LineCollection does not accept some kwargs.
kwargs = {
att: value
for att, value in kwargs.items()
if att not in ["markersize", "marker"]
}
# Add to kwargs for easier checking below.
if color is not None:
kwargs["color"] = color
_expand_kwargs(kwargs, multiindex)
segments = [np.array(linestring.coords)[:, :2] for linestring in geoms]
collection = LineCollection(segments, **kwargs)
if values is not None:
collection.set_array(np.asarray(values))
collection.set_cmap(cmap)
if "norm" not in kwargs:
collection.set_clim(vmin, vmax)
ax.add_collection(collection, autolim=autolim)
ax.autoscale_view()
return collection
def _plot_point_collection(
ax,
geoms,
values=None,
color=None,
cmap=None,
vmin=None,
vmax=None,
marker="o",
markersize=None,
**kwargs,
):
"""
Plots a collection of Point and MultiPoint geometries to `ax`
Parameters
----------
ax : matplotlib.axes.Axes
where shapes will be plotted
geoms : sequence of `N` Points or MultiPoints
values : a sequence of `N` values, optional
Values mapped to colors using vmin, vmax, and cmap.
Cannot be specified together with `color`.
markersize : scalar or array-like, optional
Size of the markers. Note that under the hood ``scatter`` is
used, so the specified value will be proportional to the
area of the marker (size in points^2).
Returns
-------
collection : matplotlib.collections.Collection that was plotted
"""
if values is not None and color is not None:
raise ValueError("Can only specify one of 'values' and 'color' kwargs")
geoms, multiindex = _sanitize_geoms(geoms)
# values are expanded below as kwargs["c"]
x = [p.x if not p.is_empty else None for p in geoms]
y = [p.y if not p.is_empty else None for p in geoms]
# matplotlib 1.4 does not support c=None, and < 2.0 does not support s=None
if values is not None:
kwargs["c"] = values
if markersize is not None:
kwargs["s"] = markersize
# Add to kwargs for easier checking below.
if color is not None:
kwargs["color"] = color
if marker is not None:
kwargs["marker"] = marker
_expand_kwargs(kwargs, multiindex)
if "norm" not in kwargs:
collection = ax.scatter(x, y, vmin=vmin, vmax=vmax, cmap=cmap, **kwargs)
else:
collection = ax.scatter(x, y, cmap=cmap, **kwargs)
return collection
def plot_series(
s,
cmap=None,
color=None,
ax=None,
figsize=None,
aspect="auto",
autolim=True,
**style_kwds,
):
"""
Plot a GeoSeries.
Generate a plot of a GeoSeries geometry with matplotlib.
Parameters
----------
s : Series
The GeoSeries to be plotted. Currently Polygon,
MultiPolygon, LineString, MultiLineString, Point and MultiPoint
geometries can be plotted.
cmap : str (default None)
The name of a colormap recognized by matplotlib. Any
colormap will work, but categorical colormaps are
generally recommended. Examples of useful discrete
colormaps include:
tab10, tab20, Accent, Dark2, Paired, Pastel1, Set1, Set2
color : str, np.array, pd.Series, List (default None)
If specified, all objects will be colored uniformly.
ax : matplotlib.pyplot.Artist (default None)
axes on which to draw the plot
figsize : pair of floats (default None)
Size of the resulting matplotlib.figure.Figure. If the argument
ax is given explicitly, figsize is ignored.
aspect : 'auto', 'equal', None or float (default 'auto')
Set aspect of axis. If 'auto', the default aspect for map plots is 'equal'; if
however data are not projected (coordinates are long/lat), the aspect is by
default set to 1/cos(s_y * pi/180) with s_y the y coordinate of the middle of
the GeoSeries (the mean of the y range of bounding box) so that a long/lat
square appears square in the middle of the plot. This implies an
Equirectangular projection. If None, the aspect of `ax` won't be changed. It can
also be set manually (float) as the ratio of y-unit to x-unit.
autolim : bool (default True)
Update axes data limits to contain the new geometries.
**style_kwds : dict
Color options to be passed on to the actual plot function, such
as ``edgecolor``, ``facecolor``, ``linewidth``, ``markersize``,
``alpha``.
Returns
-------
ax : matplotlib axes instance
"""
try:
import matplotlib.pyplot as plt
except ImportError:
raise ImportError(
"The matplotlib package is required for plotting in geopandas. "
"You can install it using 'conda install -c conda-forge matplotlib' or "
"'pip install matplotlib'."
)
if ax is None:
fig, ax = plt.subplots(figsize=figsize)
if aspect == "auto":
if s.crs and s.crs.is_geographic:
bounds = s.total_bounds
y_coord = np.mean([bounds[1], bounds[3]])
ax.set_aspect(1 / np.cos(y_coord * np.pi / 180))
# formula ported from R package sp
# https://github.com/edzer/sp/blob/master/R/mapasp.R
else:
ax.set_aspect("equal")
elif aspect is not None:
ax.set_aspect(aspect)
if s.empty:
warnings.warn(
"The GeoSeries you are attempting to plot is "
"empty. Nothing has been displayed.",
UserWarning,
stacklevel=3,
)
return ax
if s.is_empty.all():
warnings.warn(
"The GeoSeries you are attempting to plot is "
"composed of empty geometries. Nothing has been displayed.",
UserWarning,
stacklevel=3,
)
return ax
# have colors been given for all geometries?
color_given = pd.api.types.is_list_like(color) and len(color) == len(s)
# if cmap is specified, create range of colors based on cmap
values = None
if cmap is not None:
values = np.arange(len(s))
if hasattr(cmap, "N"):
values = values % cmap.N
style_kwds["vmin"] = style_kwds.get("vmin", values.min())
style_kwds["vmax"] = style_kwds.get("vmax", values.max())
# decompose GeometryCollections
geoms, multiindex = _sanitize_geoms(s.geometry, prefix="Geom")
values = np.take(values, multiindex, axis=0) if cmap else None
# ensure indexes are consistent
if color_given and isinstance(color, pd.Series):
color = color.reindex(s.index)
expl_color = np.take(color, multiindex, axis=0) if color_given else color
expl_series = geopandas.GeoSeries(geoms)
geom_types = expl_series.geom_type
poly_idx = np.asarray((geom_types == "Polygon") | (geom_types == "MultiPolygon"))
line_idx = np.asarray(
(geom_types == "LineString")
| (geom_types == "MultiLineString")
| (geom_types == "LinearRing")
)
point_idx = np.asarray((geom_types == "Point") | (geom_types == "MultiPoint"))
# plot all Polygons and all MultiPolygon components in the same collection
polys = expl_series[poly_idx]
if not polys.empty:
# color overrides both face and edgecolor. As we want people to be
# able to use edgecolor as well, pass color to facecolor
facecolor = style_kwds.pop("facecolor", None)
color_ = expl_color[poly_idx] if color_given else color
if color is not None:
facecolor = color_
values_ = values[poly_idx] if cmap else None
_plot_polygon_collection(
ax,
polys,
values_,
facecolor=facecolor,
cmap=cmap,
autolim=autolim,
**style_kwds,
)
# plot all LineStrings and MultiLineString components in same collection
lines = expl_series[line_idx]
if not lines.empty:
values_ = values[line_idx] if cmap else None
color_ = expl_color[line_idx] if color_given else color
_plot_linestring_collection(
ax, lines, values_, color=color_, cmap=cmap, autolim=autolim, **style_kwds
)
# plot all Points in the same collection
points = expl_series[point_idx]
if not points.empty:
values_ = values[point_idx] if cmap else None
color_ = expl_color[point_idx] if color_given else color
_plot_point_collection(
ax, points, values_, color=color_, cmap=cmap, **style_kwds
)
ax.figure.canvas.draw_idle()
return ax
def plot_dataframe(
df,
column=None,
cmap=None,
color=None,
ax=None,
cax=None,
categorical=False,
legend=False,
scheme=None,
k=5,
vmin=None,
vmax=None,
markersize=None,
figsize=None,
legend_kwds=None,
categories=None,
classification_kwds=None,
missing_kwds=None,
aspect="auto",
autolim=True,
**style_kwds,
):
"""
Plot a GeoDataFrame.
Generate a plot of a GeoDataFrame with matplotlib. If a
column is specified, the plot coloring will be based on values
in that column.
Parameters
----------
column : str, np.array, pd.Series (default None)
The name of the dataframe column, np.array, or pd.Series to be plotted.
If np.array or pd.Series are used then it must have same length as
dataframe. Values are used to color the plot. Ignored if `color` is
also set.
kind : str
The kind of plots to produce. The default is to create a map ("geo").
Other supported kinds of plots from pandas:
- 'line' : line plot
- 'bar' : vertical bar plot
- 'barh' : horizontal bar plot
- 'hist' : histogram
- 'box' : BoxPlot
- 'kde' : Kernel Density Estimation plot
- 'density' : same as 'kde'
- 'area' : area plot
- 'pie' : pie plot
- 'scatter' : scatter plot
- 'hexbin' : hexbin plot.
cmap : str (default None)
The name of a colormap recognized by matplotlib.
color : str, np.array, pd.Series (default None)
If specified, all objects will be colored uniformly.
ax : matplotlib.axes.Axes (default None)
axes on which to draw the plot
cax : matplotlib.axes.Axes (default None)
axes on which to draw the legend in case of color map.
categorical : bool (default False)
If False, cmap will reflect numerical values of the
column being plotted. For non-numerical columns, this
will be set to True.
legend : bool (default False)
Plot a legend. Ignored if no `column` is given, or if `color` is given.
scheme : str (default None)
Name of a choropleth classification scheme (requires mapclassify).
A mapclassify.MapClassifier object will be used
under the hood. Supported are all schemes provided by mapclassify (e.g.
'BoxPlot', 'EqualInterval', 'FisherJenks', 'FisherJenksSampled',
'HeadTailBreaks', 'JenksCaspall', 'JenksCaspallForced',
'JenksCaspallSampled', 'MaxP', 'MaximumBreaks',
'NaturalBreaks', 'Quantiles', 'Percentiles', 'StdMean',
'UserDefined'). Arguments can be passed in classification_kwds.
k : int (default 5)
Number of classes (ignored if scheme is None)
vmin : None or float (default None)
Minimum value of cmap. If None, the minimum data value
in the column to be plotted is used.
vmax : None or float (default None)
Maximum value of cmap. If None, the maximum data value
in the column to be plotted is used.
markersize : str or float or sequence (default None)
Only applies to point geometries within a frame.
If a str, will use the values in the column of the frame specified
by markersize to set the size of markers. Otherwise can be a value
to apply to all points, or a sequence of the same length as the
number of points.
figsize : tuple of integers (default None)
Size of the resulting matplotlib.figure.Figure. If the argument
axes is given explicitly, figsize is ignored.
legend_kwds : dict (default None)
Keyword arguments to pass to :func:`matplotlib.pyplot.legend` or
:func:`matplotlib.pyplot.colorbar`.
Additional accepted keywords when `scheme` is specified:
fmt : string
A formatting specification for the bin edges of the classes in the
legend. For example, to have no decimals: ``{"fmt": "{:.0f}"}``.
labels : list-like
A list of legend labels to override the auto-generated labels.
Needs to have the same number of elements as the number of
classes (`k`).
interval : boolean (default False)
An option to control brackets from mapclassify legend.
If True, open/closed interval brackets are shown in the legend.
categories : list-like
Ordered list-like object of categories to be used for categorical plot.
classification_kwds : dict (default None)
Keyword arguments to pass to mapclassify
missing_kwds : dict (default None)
Keyword arguments specifying color options (as style_kwds)
to be passed on to geometries with missing values in addition to
or overwriting other style kwds. If None, geometries with missing
values are not plotted.
aspect : 'auto', 'equal', None or float (default 'auto')
Set aspect of axis. If 'auto', the default aspect for map plots is 'equal'; if
however data are not projected (coordinates are long/lat), the aspect is by
default set to 1/cos(df_y * pi/180) with df_y the y coordinate of the middle of
the GeoDataFrame (the mean of the y range of bounding box) so that a long/lat
square appears square in the middle of the plot. This implies an
Equirectangular projection. If None, the aspect of `ax` won't be changed. It can
also be set manually (float) as the ratio of y-unit to x-unit.
autolim : bool (default True)
Update axes data limits to contain the new geometries.
**style_kwds : dict
Style options to be passed on to the actual plot function, such
as ``edgecolor``, ``facecolor``, ``linewidth``, ``markersize``,
``alpha``.
Returns
-------
ax : matplotlib axes instance
Examples
--------
>>> import geodatasets
>>> df = geopandas.read_file(geodatasets.get_path("nybb"))
>>> df.head() # doctest: +SKIP
BoroCode ... geometry
0 5 ... MULTIPOLYGON (((970217.022 145643.332, 970227....
1 4 ... MULTIPOLYGON (((1029606.077 156073.814, 102957...
2 3 ... MULTIPOLYGON (((1021176.479 151374.797, 102100...
3 1 ... MULTIPOLYGON (((981219.056 188655.316, 980940....
4 2 ... MULTIPOLYGON (((1012821.806 229228.265, 101278...
>>> df.plot("BoroName", cmap="Set1") # doctest: +SKIP
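A classed choropleth with a legend (a sketch; requires mapclassify and
assumes the dataset's "Shape_Area" column):
>>> df.plot("Shape_Area", scheme="Quantiles", k=5, legend=True) # doctest: +SKIP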
See the User Guide page :doc:`../../user_guide/mapping` for details.
"""
if column is not None and color is not None:
warnings.warn(
"Only specify one of 'column' or 'color'. Using 'color'.",
UserWarning,
stacklevel=3,
)
column = None
try:
import matplotlib.pyplot as plt
except ImportError:
raise ImportError(
"The matplotlib package is required for plotting in geopandas. "
"You can install it using 'conda install -c conda-forge matplotlib' or "
"'pip install matplotlib'."
)
if ax is None:
if cax is not None:
raise ValueError("'ax' can not be None if 'cax' is not.")
fig, ax = plt.subplots(figsize=figsize)
if aspect == "auto":
if df.crs and df.crs.is_geographic:
bounds = df.total_bounds
y_coord = np.mean([bounds[1], bounds[3]])
ax.set_aspect(1 / np.cos(y_coord * np.pi / 180))
# formula ported from R package sp
# https://github.com/edzer/sp/blob/master/R/mapasp.R
else:
ax.set_aspect("equal")
elif aspect is not None:
ax.set_aspect(aspect)
# GH 1555
# if legend_kwds set, copy so we don't update it in place
if legend_kwds is not None:
legend_kwds = legend_kwds.copy()
if df.empty:
warnings.warn(
"The GeoDataFrame you are attempting to plot is "
"empty. Nothing has been displayed.",
UserWarning,
stacklevel=3,
)
return ax
if isinstance(markersize, str):
markersize = df[markersize].values
if column is None:
return plot_series(
df.geometry,
cmap=cmap,
color=color,
ax=ax,
figsize=figsize,
markersize=markersize,
aspect=aspect,
autolim=autolim,
**style_kwds,
)
# To accept pd.Series and np.arrays as column
if isinstance(column, (np.ndarray, pd.Series)):
if column.shape[0] != df.shape[0]:
raise ValueError(
"The dataframe and given column have different number of rows."
)
else:
values = column
# Make sure index of a Series matches index of df
if isinstance(values, pd.Series):
values = values.reindex(df.index)
else:
values = df[column]
if isinstance(values.dtype, CategoricalDtype):
if categories is not None:
raise ValueError(
"Cannot specify 'categories' when column has categorical dtype"
)
categorical = True
elif (
pd.api.types.is_object_dtype(values.dtype)
or pd.api.types.is_bool_dtype(values.dtype)
or pd.api.types.is_string_dtype(values.dtype)
or categories
):
categorical = True
nan_idx = np.asarray(pd.isna(values), dtype="bool")
if scheme is not None:
mc_err = (
"The 'mapclassify' package (>= 2.4.0) is "
"required to use the 'scheme' keyword."
)
try:
import mapclassify
except ImportError:
raise ImportError(mc_err)
if Version(mapclassify.__version__) < Version("2.4.0"):
raise ImportError(mc_err)
if classification_kwds is None:
classification_kwds = {}
if "k" not in classification_kwds:
classification_kwds["k"] = k
binning = mapclassify.classify(
np.asarray(values[~nan_idx]), scheme, **classification_kwds
)
# set categorical to True for creating the legend
categorical = True
if legend_kwds is not None and "labels" in legend_kwds:
if len(legend_kwds["labels"]) != binning.k:
raise ValueError(
"Number of labels must match number of bins, "
"received {} labels for {} bins".format(
len(legend_kwds["labels"]), binning.k
)
)
else:
labels = list(legend_kwds.pop("labels"))
else:
fmt = "{:.2f}"
if legend_kwds is not None and "fmt" in legend_kwds:
fmt = legend_kwds.pop("fmt")
labels = binning.get_legend_classes(fmt)
if legend_kwds is not None:
show_interval = legend_kwds.pop("interval", False)
else:
show_interval = False
if not show_interval:
labels = [c[1:-1] for c in labels]
values = pd.Categorical(
[np.nan] * len(values), categories=binning.bins, ordered=True
)
values[~nan_idx] = pd.Categorical.from_codes(
binning.yb, categories=binning.bins, ordered=True
)
if cmap is None:
cmap = "viridis"
# Define `values` as a Series
if categorical:
if cmap is None:
cmap = "tab10"
cat = pd.Categorical(values, categories=categories)
categories = list(cat.categories)
# values missing in the Categorical but not in original values
missing = list(np.unique(values[~nan_idx & cat.isna()]))
if missing:
raise ValueError(
"Column contains values not listed in categories. "
"Missing categories: {}.".format(missing)
)
values = cat.codes[~nan_idx]
vmin = 0 if vmin is None else vmin
vmax = len(categories) - 1 if vmax is None else vmax
# fill values with placeholder where were NaNs originally to map them properly
# (after removing them in categorical or scheme)
if categorical:
for n in np.where(nan_idx)[0]:
values = np.insert(values, n, values[0])
mn = values[~np.isnan(values)].min() if vmin is None else vmin
mx = values[~np.isnan(values)].max() if vmax is None else vmax
# decompose GeometryCollections
geoms, multiindex = _sanitize_geoms(df.geometry, prefix="Geom")
values = np.take(values, multiindex, axis=0)
nan_idx = np.take(nan_idx, multiindex, axis=0)
expl_series = geopandas.GeoSeries(geoms)
geom_types = expl_series.geom_type
poly_idx = np.asarray((geom_types == "Polygon") | (geom_types == "MultiPolygon"))
line_idx = np.asarray(
(geom_types == "LineString")
| (geom_types == "MultiLineString")
| (geom_types == "LinearRing")
)
point_idx = np.asarray((geom_types == "Point") | (geom_types == "MultiPoint"))
# plot all Polygons and all MultiPolygon components in the same collection
polys = expl_series[poly_idx & np.invert(nan_idx)]
subset = values[poly_idx & np.invert(nan_idx)]
if not polys.empty:
_plot_polygon_collection(
ax,
polys,
subset,
vmin=mn,
vmax=mx,
cmap=cmap,
autolim=autolim,
**style_kwds,
)
# plot all LineStrings and MultiLineString components in same collection
lines = expl_series[line_idx & np.invert(nan_idx)]
subset = values[line_idx & np.invert(nan_idx)]
if not lines.empty:
_plot_linestring_collection(
ax,
lines,
subset,
vmin=mn,
vmax=mx,
cmap=cmap,
autolim=autolim,
**style_kwds,
)
# plot all Points in the same collection
points = expl_series[point_idx & np.invert(nan_idx)]
subset = values[point_idx & np.invert(nan_idx)]
if not points.empty:
if isinstance(markersize, np.ndarray):
markersize = np.take(markersize, multiindex, axis=0)
markersize = markersize[point_idx & np.invert(nan_idx)]
_plot_point_collection(
ax,
points,
subset,
vmin=mn,
vmax=mx,
markersize=markersize,
cmap=cmap,
**style_kwds,
)
missing_data = not expl_series[nan_idx].empty
if missing_kwds is not None and missing_data:
if color:
if "color" not in missing_kwds:
missing_kwds["color"] = color
merged_kwds = style_kwds.copy()
merged_kwds.update(missing_kwds)
plot_series(expl_series[nan_idx], ax=ax, **merged_kwds)
if legend and not color:
if legend_kwds is None:
legend_kwds = {}
if "fmt" in legend_kwds:
legend_kwds.pop("fmt")
from matplotlib import cm
from matplotlib.colors import Normalize
from matplotlib.lines import Line2D
norm = style_kwds.get("norm", None)
if not norm:
norm = Normalize(vmin=mn, vmax=mx)
n_cmap = cm.ScalarMappable(norm=norm, cmap=cmap)
if categorical:
if scheme is not None:
categories = labels
patches = []
for i in range(len(categories)):
patches.append(
Line2D(
[0],
[0],
linestyle="none",
marker="o",
alpha=style_kwds.get("alpha", 1),
markersize=10,
markerfacecolor=n_cmap.to_rgba(i),
markeredgewidth=0,
)
)
if missing_kwds is not None and missing_data:
if "color" in merged_kwds:
merged_kwds["facecolor"] = merged_kwds["color"]
patches.append(
Line2D(
[0],
[0],
linestyle="none",
marker="o",
alpha=merged_kwds.get("alpha", 1),
markersize=10,
markerfacecolor=merged_kwds.get("facecolor", None),
markeredgecolor=merged_kwds.get("edgecolor", None),
markeredgewidth=merged_kwds.get(
"linewidth", 1 if merged_kwds.get("edgecolor", False) else 0
),
)
)
categories.append(merged_kwds.get("label", "NaN"))
legend_kwds.setdefault("numpoints", 1)
legend_kwds.setdefault("loc", "best")
legend_kwds.setdefault("handles", patches)
legend_kwds.setdefault("labels", categories)
ax.legend(**legend_kwds)
else:
if cax is not None:
legend_kwds.setdefault("cax", cax)
else:
legend_kwds.setdefault("ax", ax)
n_cmap.set_array(np.array([]))
ax.get_figure().colorbar(n_cmap, **legend_kwds)
ax.figure.canvas.draw_idle()
return ax
@doc(plot_dataframe)
class GeoplotAccessor(PlotAccessor):
_pandas_kinds = PlotAccessor._all_kinds
def __call__(self, *args, **kwargs):
data = self._parent.copy()
kind = kwargs.pop("kind", "geo")
if kind == "geo":
return plot_dataframe(data, *args, **kwargs)
if kind in self._pandas_kinds:
# Access pandas plots
return PlotAccessor(data)(kind=kind, **kwargs)
else:
# neither a geo plot kind nor a pandas plot kind
raise ValueError(f"{kind} is not a valid plot kind")
def geo(self, *args, **kwargs):
return self(kind="geo", *args, **kwargs) # noqa: B026
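# Usage sketch for the accessor (illustrative, assuming ``df`` is a GeoDataFrame):
# df.plot() draws a map via plot_dataframe (kind="geo"), while
# df.plot(kind="hist") falls through to the pandas PlotAccessor.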

View File

@@ -1,505 +0,0 @@
import numpy as np
import shapely
from shapely.geometry.base import BaseGeometry
from . import _compat as compat
from . import array, geoseries
PREDICATES = {p.name for p in shapely.strtree.BinaryPredicate} | {None}
if compat.GEOS_GE_310:
PREDICATES.update(["dwithin"])
class SpatialIndex:
"""A simple wrapper around Shapely's STRTree.
Parameters
----------
geometry : np.array of Shapely geometries
Geometries from which to build the spatial index.
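Examples
--------
>>> import numpy as np
>>> from shapely.geometry import Point
>>> SpatialIndex(np.array([Point(0, 0), Point(1, 1)])).size
2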
"""
def __init__(self, geometry):
# set empty geometries to None to avoid segfault on GEOS <= 3.6
# see:
# https://github.com/pygeos/pygeos/issues/146
# https://github.com/pygeos/pygeos/issues/147
non_empty = geometry.copy()
non_empty[shapely.is_empty(non_empty)] = None
# set empty geometries to None to maintain indexing
self._tree = shapely.STRtree(non_empty)
# store geometries, including empty geometries for user access
self.geometries = geometry.copy()
@property
def valid_query_predicates(self):
"""Returns valid predicates for the spatial index.
Returns
-------
set
Set of valid predicates for this spatial index.
Examples
--------
>>> from shapely.geometry import Point
>>> s = geopandas.GeoSeries([Point(0, 0), Point(1, 1)])
>>> s.sindex.valid_query_predicates # doctest: +SKIP
{None, "contains", "contains_properly", "covered_by", "covers", \
"crosses", "dwithin", "intersects", "overlaps", "touches", "within"}
"""
return PREDICATES
def query(
self, geometry, predicate=None, sort=False, distance=None, output_format="tuple"
):
"""
Return the integer indices of all combinations of each input geometry
and tree geometries where the bounding box of each input geometry
intersects the bounding box of a tree geometry.
If the input geometry is a scalar, this returns an array of shape (n, ) with
the indices of the matching tree geometries. If the input geometry is an
array_like, this returns an array with shape (2,n) where the subarrays
correspond to the indices of the input geometries and indices of the
tree geometries associated with each. To generate an array of pairs of
input geometry index and tree geometry index, simply transpose the
result.
If a predicate is provided, the tree geometries are first queried based
on the bounding box of the input geometry and then are further filtered
to those that meet the predicate when comparing the input geometry to
the tree geometry: ``predicate(geometry, tree_geometry)``.
The 'dwithin' predicate requires GEOS >= 3.10.
Bounding boxes are limited to two dimensions and are axis-aligned
(equivalent to the ``bounds`` property of a geometry); any Z values
present in input geometries are ignored when querying the tree.
Any input geometry that is None or empty will never match geometries in
the tree.
Parameters
----------
geometry : shapely.Geometry or array-like of geometries \
(numpy.ndarray, GeoSeries, GeometryArray)
A single shapely geometry or array of geometries to query against
the spatial index. For array-like, accepts both GeoPandas geometry
iterables (GeoSeries, GeometryArray) or a numpy array of Shapely
geometries.
predicate : {None, "contains", "contains_properly", "covered_by", "covers", \
"crosses", "intersects", "overlaps", "touches", "within", "dwithin"}, optional
If predicate is provided, the input geometries are tested
using the predicate function against each item in the tree
whose extent intersects the envelope of the input geometry:
``predicate(input_geometry, tree_geometry)``.
If possible, prepared geometries are used to help speed up the
predicate operation.
sort : bool, default False
If True, the results will be sorted in ascending order. In case
of 2D array, the result is sorted lexicographically using the
geometries' indexes as the primary key and the sindex's indexes
as the secondary key.
If False, no additional sorting is applied (results are often
sorted but there is no guarantee).
distance : number or array_like, optional
Distances around each input geometry within which to query the tree for
the 'dwithin' predicate. If array_like, shape must be broadcastable to shape
of geometry. Required if ``predicate='dwithin'``.
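output_format : {'tuple', 'sparse', 'dense'}, default 'tuple'
'tuple' returns the integer indices described under Returns. 'sparse'
returns a scipy.sparse.coo_array boolean matrix with shape
(len(geometry), len(tree)) and requires scipy; 'dense' returns the
equivalent dense numpy boolean array. Both matrix formats force sorted
results and expect an array-like ``geometry``.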
Returns
-------
ndarray with shape (n,) if geometry is a scalar
Integer indices for matching geometries from the spatial index
tree geometries.
OR
ndarray with shape (2, n) if geometry is an array_like
The first subarray contains input geometry integer indices.
The second subarray contains tree geometry integer indices.
Examples
--------
>>> from shapely.geometry import Point, box
>>> s = geopandas.GeoSeries(geopandas.points_from_xy(range(10), range(10)))
>>> s
0 POINT (0 0)
1 POINT (1 1)
2 POINT (2 2)
3 POINT (3 3)
4 POINT (4 4)
5 POINT (5 5)
6 POINT (6 6)
7 POINT (7 7)
8 POINT (8 8)
9 POINT (9 9)
dtype: geometry
Querying the tree with a scalar geometry:
>>> s.sindex.query(box(1, 1, 3, 3))
array([1, 2, 3])
>>> s.sindex.query(box(1, 1, 3, 3), predicate="contains")
array([2])
Querying the tree with an array of geometries:
>>> s2 = geopandas.GeoSeries([box(2, 2, 4, 4), box(5, 5, 6, 6)])
>>> s2
0 POLYGON ((4 2, 4 4, 2 4, 2 2, 4 2))
1 POLYGON ((6 5, 6 6, 5 6, 5 5, 6 5))
dtype: geometry
>>> s.sindex.query(s2)
array([[0, 0, 0, 1, 1],
[2, 3, 4, 5, 6]])
>>> s.sindex.query(s2, predicate="contains")
array([[0],
[3]])
>>> s.sindex.query(box(1, 1, 3, 3), predicate="dwithin", distance=0)
array([1, 2, 3])
>>> s.sindex.query(box(1, 1, 3, 3), predicate="dwithin", distance=2)
array([0, 1, 2, 3, 4])
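Instead of integer indices, a boolean matrix can be requested ('sparse'
additionally requires scipy):
>>> s.sindex.query(s2, output_format="dense")  # doctest: +SKIP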
Notes
-----
In the context of a spatial join, input geometries are the "left"
geometries that determine the order of the results, and tree geometries
are "right" geometries that are joined against the left geometries. This
effectively performs an inner join, where only those combinations of
geometries that can be joined based on overlapping bounding boxes or
optional predicate are returned.
"""
if predicate not in self.valid_query_predicates:
if predicate == "dwithin":
raise ValueError("predicate = 'dwithin' requires GEOS >= 3.10.0")
raise ValueError(
"Got predicate='{}'; ".format(predicate)
+ "`predicate` must be one of {}".format(self.valid_query_predicates)
)
# distance argument requirement of predicate `dwithin`
# and only valid for predicate `dwithin`
kwargs = {}
if predicate == "dwithin":
if distance is None:
# the distance parameter is needed
raise ValueError(
"'distance' parameter is required for 'dwithin' predicate"
)
# add distance to kwargs
kwargs["distance"] = distance
elif distance is not None:
# distance parameter is invalid
raise ValueError(
"'distance' parameter is only supported in combination with "
"'dwithin' predicate"
)
geometry = self._as_geometry_array(geometry)
indices = self._tree.query(geometry, predicate=predicate, **kwargs)
if output_format != "tuple":
sort = True
if sort:
if indices.ndim == 1:
indices = np.sort(indices)
else:
# sort by first array (geometry) and then second (tree)
geo_idx, tree_idx = indices
sort_indexer = np.lexsort((tree_idx, geo_idx))
indices = np.vstack((geo_idx[sort_indexer], tree_idx[sort_indexer]))
if output_format == "sparse":
from scipy.sparse import coo_array
return coo_array(
(np.ones(len(indices[0]), dtype=np.bool_), indices),
shape=(len(self.geometries), len(geometry)),
dtype=np.bool_,
)
if output_format == "dense":
dense = np.zeros((len(self.geometries), len(geometry)), dtype=bool)
dense[indices] = True
return dense
if output_format == "tuple":
return indices
raise ValueError("Invalid output_format: {}".format(output_format))
@staticmethod
def _as_geometry_array(geometry):
"""Convert geometry into a numpy array of Shapely geometries.
Parameters
----------
geometry
An array-like of Shapely geometries, a GeoPandas GeoSeries/GeometryArray,
shapely.geometry or list of shapely geometries.
Returns
-------
np.ndarray
A numpy array of Shapely geometries.
"""
if isinstance(geometry, np.ndarray):
return array.from_shapely(geometry)._data
elif isinstance(geometry, geoseries.GeoSeries):
return geometry.values._data
elif isinstance(geometry, array.GeometryArray):
return geometry._data
elif isinstance(geometry, BaseGeometry):
return geometry
elif geometry is None:
return None
else:
return np.asarray(geometry)
def nearest(
self,
geometry,
return_all=True,
max_distance=None,
return_distance=False,
exclusive=False,
):
"""
Return the nearest geometry in the tree for each input geometry in
``geometry``.
If multiple tree geometries have the same distance from an input geometry,
multiple results will be returned for that input geometry by default.
Specify ``return_all=False`` to only get a single nearest geometry
(non-deterministic which nearest is returned).
In the context of a spatial join, input geometries are the "left"
geometries that determine the order of the results, and tree geometries
are "right" geometries that are joined against the left geometries.
If ``max_distance`` is not set, this will effectively be a left join
because every geometry in ``geometry`` will have a nearest geometry in
the tree. However, if ``max_distance`` is used, this becomes an
inner join, since some geometries in ``geometry`` may not have a match
in the tree.
For performance reasons, it is highly recommended that you set
the ``max_distance`` parameter.
Parameters
----------
geometry : {shapely.geometry, GeoSeries, GeometryArray, numpy.array of Shapely \
geometries}
A single shapely geometry, one of the GeoPandas geometry iterables
(GeoSeries, GeometryArray), or a numpy array of Shapely geometries to query
against the spatial index.
return_all : bool, default True
If there are multiple equidistant or intersecting nearest
geometries, return all those geometries instead of a single
nearest geometry.
max_distance : float, optional
Maximum distance within which to query for nearest items in tree.
Must be greater than 0. By default None, indicating no distance limit.
return_distance : bool, optional
If True, will return distances in addition to indexes. By default False
exclusive : bool, optional
if True, the nearest geometries that are equal to the input geometry
will not be returned. By default False. Requires Shapely >= 2.0.
Returns
-------
Indices or tuple of (indices, distances)
Indices is an ndarray of shape (2,n) and distances (if present) an
ndarray of shape (n).
The first subarray of indices contains input geometry indices.
The second subarray of indices contains tree geometry indices.
Examples
--------
>>> from shapely.geometry import Point, box
>>> s = geopandas.GeoSeries(geopandas.points_from_xy(range(10), range(10)))
>>> s.head()
0 POINT (0 0)
1 POINT (1 1)
2 POINT (2 2)
3 POINT (3 3)
4 POINT (4 4)
dtype: geometry
>>> s.sindex.nearest(Point(1, 1))
array([[0],
[1]])
>>> s.sindex.nearest([box(4.9, 4.9, 5.1, 5.1)])
array([[0],
[5]])
>>> s2 = geopandas.GeoSeries(geopandas.points_from_xy([7.6, 10], [7.6, 10]))
>>> s2
0 POINT (7.6 7.6)
1 POINT (10 10)
dtype: geometry
>>> s.sindex.nearest(s2)
array([[0, 1],
[8, 9]])
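Distances can be returned together with the indices:
>>> s.sindex.nearest(s2, return_distance=True)  # doctest: +SKIP
(array([[0, 1],
       [8, 9]]), array([0.56568542, 1.41421356]))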
"""
geometry = self._as_geometry_array(geometry)
if isinstance(geometry, BaseGeometry) or geometry is None:
geometry = [geometry]
result = self._tree.query_nearest(
geometry,
max_distance=max_distance,
return_distance=return_distance,
all_matches=return_all,
exclusive=exclusive,
)
# query_nearest already returns an (indices, distances) tuple when
# return_distance=True, and just the indices otherwise
return result
def intersection(self, coordinates):
"""Compatibility wrapper for rtree.index.Index.intersection,
use ``query`` instead.
Parameters
----------
coordinates : sequence or array
Sequence of the form (min_x, min_y, max_x, max_y)
to query a rectangle or (x, y) to query a point.
Examples
--------
>>> from shapely.geometry import Point, box
>>> s = geopandas.GeoSeries(geopandas.points_from_xy(range(10), range(10)))
>>> s
0 POINT (0 0)
1 POINT (1 1)
2 POINT (2 2)
3 POINT (3 3)
4 POINT (4 4)
5 POINT (5 5)
6 POINT (6 6)
7 POINT (7 7)
8 POINT (8 8)
9 POINT (9 9)
dtype: geometry
>>> s.sindex.intersection(box(1, 1, 3, 3).bounds)
array([1, 2, 3])
Alternatively, you can use ``query``:
>>> s.sindex.query(box(1, 1, 3, 3))
array([1, 2, 3])
"""
# TODO: we should deprecate this
# convert bounds to geometry
# the old API uses tuples of bound, but Shapely uses geometries
try:
iter(coordinates)
except TypeError:
# likely not an iterable
# this is a check that rtree does, we mimic it
# to ensure a useful failure message
raise TypeError(
"Invalid coordinates, must be iterable in format "
"(minx, miny, maxx, maxy) (for bounds) or (x, y) (for points). "
"Got `coordinates` = {}.".format(coordinates)
)
# need to convert tuple of bounds to a geometry object
if len(coordinates) == 4:
indexes = self._tree.query(shapely.box(*coordinates))
elif len(coordinates) == 2:
indexes = self._tree.query(shapely.points(*coordinates))
else:
raise TypeError(
"Invalid coordinates, must be iterable in format "
"(minx, miny, maxx, maxy) (for bounds) or (x, y) (for points). "
"Got `coordinates` = {}.".format(coordinates)
)
return indexes
@property
def size(self):
"""Size of the spatial index
Number of leaves (input geometries) in the index.
Examples
--------
>>> from shapely.geometry import Point
>>> s = geopandas.GeoSeries(geopandas.points_from_xy(range(10), range(10)))
>>> s
0 POINT (0 0)
1 POINT (1 1)
2 POINT (2 2)
3 POINT (3 3)
4 POINT (4 4)
5 POINT (5 5)
6 POINT (6 6)
7 POINT (7 7)
8 POINT (8 8)
9 POINT (9 9)
dtype: geometry
>>> s.sindex.size
10
"""
return len(self._tree)
@property
def is_empty(self):
"""Check if the spatial index is empty
Examples
--------
>>> from shapely.geometry import Point
>>> s = geopandas.GeoSeries(geopandas.points_from_xy(range(10), range(10)))
>>> s
0 POINT (0 0)
1 POINT (1 1)
2 POINT (2 2)
3 POINT (3 3)
4 POINT (4 4)
5 POINT (5 5)
6 POINT (6 6)
7 POINT (7 7)
8 POINT (8 8)
9 POINT (9 9)
dtype: geometry
>>> s.sindex.is_empty
False
>>> s2 = geopandas.GeoSeries()
>>> s2.sindex.is_empty
True
"""
return len(self._tree) == 0
def __len__(self):
return len(self._tree)

View File

@@ -1,358 +0,0 @@
"""
Testing functionality for geopandas objects.
"""
import warnings
import pandas as pd
from geopandas import GeoDataFrame, GeoSeries
from geopandas.array import GeometryDtype
def _isna(this):
"""isna version that works for both scalars and (Geo)Series"""
with warnings.catch_warnings():
# GeoSeries.isna will raise a warning about no longer returning True
# for empty geometries. This helper is used below always in combination
# with an is_empty check to preserve behaviour, and thus we ignore the
# warning here to avoid it bubbling up to the user
warnings.filterwarnings(
"ignore", r"GeoSeries.isna\(\) previously returned", UserWarning
)
if hasattr(this, "isna"):
return this.isna()
elif hasattr(this, "isnull"):
return this.isnull()
else:
return pd.isnull(this)
def _geom_equals_mask(this, that):
"""
Test for geometric equality. Empty or missing geometries are considered
equal.
Parameters
----------
this, that : arrays of Geo objects (or anything that has an `is_empty`
attribute)
Returns
-------
Series
boolean Series, True if geometries in left equal geometries in right
"""
return (
this.geom_equals(that)
| (this.is_empty & that.is_empty)
| (_isna(this) & _isna(that))
)
def geom_equals(this, that):
"""
Test for geometric equality. Empty or missing geometries are considered
equal.
Parameters
----------
this, that : arrays of Geo objects (or anything that has an `is_empty`
attribute)
Returns
-------
bool
True if all geometries in left equal geometries in right
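Examples
--------
Compare two single-point GeoSeries:
>>> from shapely.geometry import Point
>>> geom_equals(GeoSeries([Point(0, 0)]), GeoSeries([Point(0, 0)]))
True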
"""
return _geom_equals_mask(this, that).all()
def _geom_almost_equals_mask(this, that):
"""
Test for 'almost' geometric equality. Empty or missing geometries are
considered equal.
This method allows small differences in the coordinates, but requires
the coordinates to be in the same order for all components of a geometry.
Parameters
----------
this, that : arrays of Geo objects
Returns
-------
Series
boolean Series, True if geometries in left almost equal geometries in right
"""
return (
this.geom_equals_exact(that, tolerance=0.5 * 10 ** (-6))
| (this.is_empty & that.is_empty)
| (_isna(this) & _isna(that))
)
def geom_almost_equals(this, that):
"""
Test for 'almost' geometric equality. Empty or missing geometries are
considered equal.
This method allows small differences in the coordinates, but requires
the coordinates to be in the same order for all components of a geometry.
Parameters
----------
this, that : arrays of Geo objects (or anything that has an `is_empty`
property)
Returns
-------
bool
True if all geometries in left almost equal geometries in right
"""
if isinstance(this, GeoDataFrame) and isinstance(that, GeoDataFrame):
this = this.geometry
that = that.geometry
return _geom_almost_equals_mask(this, that).all()
def assert_geoseries_equal(
left,
right,
check_dtype=True,
check_index_type=False,
check_series_type=True,
check_less_precise=False,
check_geom_type=False,
check_crs=True,
normalize=False,
):
"""
Test util for checking that two GeoSeries are equal.
Parameters
----------
left, right : two GeoSeries
check_dtype : bool, default True
If True, check geo dtype [only included so it's a drop-in replacement
for assert_series_equal].
check_index_type : bool, default False
Check that index types are equal.
check_series_type : bool, default True
Check that both are same type (*and* are GeoSeries). If False,
will attempt to convert both into GeoSeries.
check_less_precise : bool, default False
If True, use geom_equals_exact with relative error of 0.5e-6.
If False, use geom_equals.
check_geom_type : bool, default False
If True, check that all the geom types are equal.
check_crs: bool, default True
If `check_series_type` is True, then also check that the
crs matches.
normalize: bool, default False
If True, normalize the geometries before comparing equality.
Typically useful with ``check_less_precise=True``, which uses
``geom_equals_exact`` and requires exact coordinate order.
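Examples
--------
Two identical GeoSeries compare equal:
>>> from shapely.geometry import Point
>>> s1 = GeoSeries([Point(0, 0), Point(1, 1)])
>>> s2 = GeoSeries([Point(0, 0), Point(1, 1)])
>>> assert_geoseries_equal(s1, s2)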
"""
assert len(left) == len(right), "%d != %d" % (len(left), len(right))
if check_dtype:
msg = "dtype should be a GeometryDtype, got {0}"
assert isinstance(left.dtype, GeometryDtype), msg.format(left.dtype)
assert isinstance(right.dtype, GeometryDtype), msg.format(right.dtype)
if check_index_type:
assert isinstance(left.index, type(right.index))
if check_series_type:
assert isinstance(left, GeoSeries)
assert isinstance(left, type(right))
if check_crs:
assert left.crs == right.crs
else:
if not isinstance(left, GeoSeries):
left = GeoSeries(left)
if not isinstance(right, GeoSeries):
right = GeoSeries(right, index=left.index)
assert left.index.equals(right.index), "index: %s != %s" % (left.index, right.index)
if check_geom_type:
assert (left.geom_type == right.geom_type).all(), "type: %s != %s" % (
left.geom_type,
right.geom_type,
)
if normalize:
left = GeoSeries(left.array.normalize())
right = GeoSeries(right.array.normalize())
if not check_crs:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "CRS mismatch", UserWarning)
_check_equality(left, right, check_less_precise)
else:
_check_equality(left, right, check_less_precise)
def _truncated_string(geom):
"""Truncated WKT repr of geom"""
s = str(geom)
if len(s) > 100:
return s[:100] + "..."
else:
return s
def _check_equality(left, right, check_less_precise):
assert_error_message = (
"{0} out of {1} geometries are not {3}equal.\n"
"Indices where geometries are not {3}equal: {2} \n"
"The first not {3}equal geometry:\n"
"Left: {4}\n"
"Right: {5}\n"
)
if check_less_precise:
precise = "almost "
equal = _geom_almost_equals_mask(left, right)
else:
precise = ""
equal = _geom_equals_mask(left, right)
if not equal.all():
unequal_left_geoms = left[~equal]
unequal_right_geoms = right[~equal]
raise AssertionError(
assert_error_message.format(
len(unequal_left_geoms),
len(left),
unequal_left_geoms.index.to_list(),
precise,
_truncated_string(unequal_left_geoms.iloc[0]),
_truncated_string(unequal_right_geoms.iloc[0]),
)
)
def assert_geodataframe_equal(
left,
right,
check_dtype=True,
check_index_type="equiv",
check_column_type="equiv",
check_frame_type=True,
check_like=False,
check_less_precise=False,
check_geom_type=False,
check_crs=True,
normalize=False,
):
"""
Check that two GeoDataFrames are equal.
Parameters
----------
left, right : two GeoDataFrames
check_dtype : bool, default True
Whether to check the DataFrame dtype is identical.
check_index_type, check_column_type : bool or 'equiv', default 'equiv'
Check that index types are equal.
check_frame_type : bool, default True
Check that both are same type (*and* are GeoDataFrames). If False,
will attempt to convert both into GeoDataFrame.
check_like : bool, default False
If True, ignore the order of rows and columns.
check_less_precise : bool, default False
If True, use geom_equals_exact; if False, use geom_equals.
check_geom_type : bool, default False
If True, check that all the geom types are equal.
check_crs: bool, default True
If `check_frame_type` is True, then also check that the
crs matches.
normalize: bool, default False
If True, normalize the geometries before comparing equality.
Typically useful with ``check_less_precise=True``, which uses
``geom_equals_exact`` and requires exact coordinate order.
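Examples
--------
Two identical GeoDataFrames compare equal:
>>> from shapely.geometry import Point
>>> df1 = GeoDataFrame({"geometry": [Point(0, 0)], "value": [1]})
>>> df2 = GeoDataFrame({"geometry": [Point(0, 0)], "value": [1]})
>>> assert_geodataframe_equal(df1, df2)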
"""
try:
# added from pandas 0.20
from pandas.testing import assert_frame_equal, assert_index_equal
except ImportError:
from pandas.util.testing import assert_frame_equal, assert_index_equal
# instance validation
if check_frame_type:
assert isinstance(left, GeoDataFrame)
assert isinstance(left, type(right))
if check_crs:
# allow if neither left and right has an active geometry column
if (
left._geometry_column_name is None
and right._geometry_column_name is None
):
pass
elif (
left._geometry_column_name not in left.columns
and right._geometry_column_name not in right.columns
):
pass
# "no CRS" can be represented as either None or {}
elif not left.crs and not right.crs:
pass
else:
assert left.crs == right.crs
else:
if not isinstance(left, GeoDataFrame):
left = GeoDataFrame(left)
if not isinstance(right, GeoDataFrame):
right = GeoDataFrame(right)
# shape comparison
assert left.shape == right.shape, (
"GeoDataFrame shape mismatch, left: {lshape!r}, right: {rshape!r}.\n"
"Left columns: {lcols!r}, right columns: {rcols!r}"
).format(
lshape=left.shape, rshape=right.shape, lcols=left.columns, rcols=right.columns
)
if check_like:
left = left.reindex_like(right)
# column comparison
assert_index_equal(
left.columns, right.columns, exact=check_column_type, obj="GeoDataFrame.columns"
)
# geometry comparison
for col, dtype in left.dtypes.items():
if isinstance(dtype, GeometryDtype):
assert_geoseries_equal(
left[col],
right[col],
normalize=normalize,
check_dtype=check_dtype,
check_less_precise=check_less_precise,
check_geom_type=check_geom_type,
check_crs=check_crs,
)
# ensure the active geometry column is the same
assert left._geometry_column_name == right._geometry_column_name
# drop geometries and check remaining columns
left2 = left.select_dtypes(exclude="geometry")
right2 = right.select_dtypes(exclude="geometry")
assert_frame_equal(
left2,
right2,
check_dtype=check_dtype,
check_index_type=check_index_type,
check_column_type=check_column_type,
obj="GeoDataFrame",
)

View File

@@ -1,9 +0,0 @@
{
"type": "FeatureCollection",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "Name": "Null Geometry" }, "geometry": null },
{ "type": "Feature", "properties": { "Name": "SF to NY" }, "geometry": { "type": "LineString", "coordinates": [ [ -122.4051293283311, 37.786780113640894 ], [ -73.859832357849271, 40.487594916296196 ] ] } }
]
}

View File

@@ -1,38 +0,0 @@
import subprocess
import sys
def test_no_additional_imports():
# test that 'import geopandas' does not import any of the optional or
# development dependencies
blacklist = {
"pytest",
"py",
"ipython",
# fiona actually gets imported if installed (but error suppressed until used)
# "fiona",
# "matplotlib", # matplotlib gets imported by pandas, see below
"mapclassify",
"sqlalchemy",
"psycopg",
"psycopg2",
"geopy",
"geoalchemy2",
"matplotlib",
}
code = """
import sys
import geopandas
blacklist = {0!r}
mods = blacklist & set(m.split('.')[0] for m in sys.modules)
if mods:
sys.stderr.write('err: geopandas should not import: {{}}'.format(', '.join(mods)))
sys.exit(len(mods))
""".format(
blacklist
)
call = [sys.executable, "-c", code]
returncode = subprocess.run(call, check=False).returncode
assert returncode == 0

View File

@@ -1,30 +0,0 @@
from geopandas._compat import import_optional_dependency
import pytest
def test_import_optional_dependency_present():
# pandas is not optional, but we know it is present
pandas = import_optional_dependency("pandas")
assert pandas is not None
# module imported normally must be same
import pandas as pd
assert pandas == pd
def test_import_optional_dependency_absent():
with pytest.raises(ImportError, match="Missing optional dependency 'foo'"):
import_optional_dependency("foo")
with pytest.raises(ImportError, match="foo is required"):
import_optional_dependency("foo", extra="foo is required")
@pytest.mark.parametrize(
"bad_import", [["foo"], 0, False, True, {}, {"foo"}, {"foo": "bar"}]
)
def test_import_optional_dependency_invalid(bad_import):
with pytest.raises(ValueError, match="Invalid module name"):
import_optional_dependency(bad_import)

View File

@@ -1,47 +0,0 @@
import geopandas
import pytest
def test_options():
assert "display_precision: " in repr(geopandas.options)
assert set(dir(geopandas.options)) == {
"display_precision",
"use_pygeos",
"io_engine",
}
with pytest.raises(AttributeError):
geopandas.options.non_existing_option
with pytest.raises(AttributeError):
geopandas.options.non_existing_option = 10
def test_options_display_precision():
assert geopandas.options.display_precision is None
geopandas.options.display_precision = 5
assert geopandas.options.display_precision == 5
with pytest.raises(ValueError):
geopandas.options.display_precision = "abc"
with pytest.raises(ValueError):
geopandas.options.display_precision = -1
geopandas.options.display_precision = None
def test_options_io_engine():
assert geopandas.options.io_engine is None
geopandas.options.io_engine = "pyogrio"
assert geopandas.options.io_engine == "pyogrio"
with pytest.raises(ValueError):
geopandas.options.io_engine = "abc"
with pytest.raises(ValueError):
geopandas.options.io_engine = -1
geopandas.options.io_engine = None

View File

@@ -1,747 +0,0 @@
import random
import warnings
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point, Polygon
from geopandas import GeoDataFrame, GeoSeries, points_from_xy, read_file
from geopandas.array import GeometryArray, from_shapely, from_wkb, from_wkt
import pytest
from geopandas.testing import assert_geodataframe_equal
pyproj = pytest.importorskip("pyproj")
def _create_df(x, y=None, crs=None):
y = y or x
x = np.asarray(x)
y = np.asarray(y)
return GeoDataFrame(
{"geometry": points_from_xy(x, y), "value1": x + y, "value2": x * y}, crs=crs
)
def df_epsg26918():
# EPSG:26918
# Center coordinates
# -1683723.64 6689139.23
return _create_df(
x=range(-1683723, -1683723 + 10, 1),
y=range(6689139, 6689139 + 10, 1),
crs="epsg:26918",
)
def test_to_crs_transform():
df = df_epsg26918()
lonlat = df.to_crs(epsg=4326)
utm = lonlat.to_crs(epsg=26918)
assert_geodataframe_equal(df, utm, check_less_precise=True)
def test_to_crs_transform__missing_data():
# https://github.com/geopandas/geopandas/issues/1573
df = df_epsg26918()
df.loc[3, "geometry"] = None
lonlat = df.to_crs(epsg=4326)
utm = lonlat.to_crs(epsg=26918)
assert_geodataframe_equal(df, utm, check_less_precise=True)
def test_to_crs_transform__empty_data():
df = df_epsg26918().iloc[:0]
lonlat = df.to_crs(epsg=4326)
utm = lonlat.to_crs(epsg=26918)
assert_geodataframe_equal(df, utm, check_less_precise=True)
def test_to_crs_inplace():
df = df_epsg26918()
lonlat = df.to_crs(epsg=4326)
df.to_crs(epsg=4326, inplace=True)
assert_geodataframe_equal(df, lonlat, check_less_precise=True)
def test_to_crs_geo_column_name():
# Test to_crs() with different geometry column name (GH#339)
df = df_epsg26918()
df = df.rename(columns={"geometry": "geom"})
df.set_geometry("geom", inplace=True)
lonlat = df.to_crs(epsg=4326)
utm = lonlat.to_crs(epsg=26918)
assert lonlat.geometry.name == "geom"
assert utm.geometry.name == "geom"
assert_geodataframe_equal(df, utm, check_less_precise=True)
def test_to_crs_dimension_z():
# preserve z dimension
arr = points_from_xy([1, 2], [2, 3], [3, 4], crs=4326)
assert arr.has_z.all()
result = arr.to_crs(epsg=3857)
assert result.has_z.all()
# pyproj + numpy 1.25 trigger a warning for single-element arrays -> the
# recommendation is to ignore it for now (https://github.com/pyproj4/pyproj/issues/1307)
@pytest.mark.filterwarnings("ignore:Conversion of an array with:DeprecationWarning")
def test_to_crs_dimension_mixed():
s = GeoSeries([Point(1, 2), LineString([(1, 2, 3), (4, 5, 6)])], crs=2056)
result = s.to_crs(epsg=4326)
assert not result[0].is_empty
assert result.has_z.tolist() == [False, True]
roundtrip = result.to_crs(epsg=2056)
# TODO replace with assert_geoseries_equal once we expose tolerance keyword
# assert_geoseries_equal(roundtrip, s, check_less_precise=True)
for a, b in zip(roundtrip, s):
np.testing.assert_allclose(a.coords[:], b.coords[:], atol=0.01)
# -----------------------------------------------------------------------------
# Test different supported formats for CRS specification
@pytest.fixture(
params=[
4326,
"epsg:4326",
pytest.param(
{"init": "epsg:4326"},
),
"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs",
{"proj": "latlong", "ellps": "WGS84", "datum": "WGS84", "no_defs": True},
],
ids=["epsg_number", "epsg_string", "epsg_dict", "proj4_string", "proj4_dict"],
)
def epsg4326(request):
if isinstance(request.param, int):
return {"epsg": request.param}
return {"crs": request.param}
@pytest.fixture(
params=[
26918,
"epsg:26918",
pytest.param(
{"init": "epsg:26918", "no_defs": True},
),
"+proj=utm +zone=18 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ",
{"proj": "utm", "zone": 18, "datum": "NAD83", "units": "m", "no_defs": True},
],
ids=["epsg_number", "epsg_string", "epsg_dict", "proj4_string", "proj4_dict"],
)
def epsg26918(request):
if isinstance(request.param, int):
return {"epsg": request.param}
return {"crs": request.param}
@pytest.mark.filterwarnings("ignore:'\\+init:DeprecationWarning")
@pytest.mark.filterwarnings("ignore:'\\+init:FutureWarning")
def test_transform2(epsg4326, epsg26918):
# with PROJ >= 7, the transformation using EPSG code vs proj4 string is
# slightly different due to use of grid files or not -> turn off network
# to not use grid files at all for this test
pyproj.network.set_network_enabled(False)
df = df_epsg26918()
lonlat = df.to_crs(**epsg4326)
utm = lonlat.to_crs(**epsg26918)
# can't check for CRS equality, as the formats differ although representing
# the same CRS
assert_geodataframe_equal(df, utm, check_less_precise=True, check_crs=False)
# pyproj + numpy 1.25 trigger a warning for single-element arrays -> the
# recommendation is to ignore it for now (https://github.com/pyproj4/pyproj/issues/1307)
@pytest.mark.filterwarnings("ignore:Conversion of an array with:DeprecationWarning")
def test_crs_axis_order__always_xy():
df = GeoDataFrame(geometry=[Point(-1683723, 6689139)], crs="epsg:26918")
lonlat = df.to_crs("epsg:4326")
test_lonlat = GeoDataFrame(
geometry=[Point(-110.1399901, 55.1350011)], crs="epsg:4326"
)
assert_geodataframe_equal(lonlat, test_lonlat, check_less_precise=True)
def test_skip_exact_same():
df = df_epsg26918()
utm = df.to_crs(df.crs)
assert_geodataframe_equal(df, utm, check_less_precise=True)
# Test CRS on GeometryArray level
class TestGeometryArrayCRS:
def setup_method(self):
self.osgb = pyproj.CRS(27700)
self.wgs = pyproj.CRS(4326)
self.geoms = [Point(0, 0), Point(1, 1)]
self.polys = [
Polygon([(random.random(), random.random()) for i in range(3)])
for _ in range(10)
]
self.arr = from_shapely(self.polys, crs=27700)
def test_array(self):
arr = from_shapely(self.geoms)
arr.crs = 27700
assert arr.crs == self.osgb
arr = from_shapely(self.geoms, crs=27700)
assert arr.crs == self.osgb
arr = GeometryArray(arr)
assert arr.crs == self.osgb
arr = GeometryArray(arr, crs=4326)
assert arr.crs == self.wgs
def test_series(self):
s = GeoSeries(crs=27700)
assert s.crs == self.osgb
assert s.values.crs == self.osgb
arr = from_shapely(self.geoms)
s = GeoSeries(arr, crs=27700)
assert s.crs == self.osgb
assert s.values.crs == self.osgb
# manually change CRS
s = s.set_crs(4326, allow_override=True)
assert s.crs == self.wgs
assert s.values.crs == self.wgs
s = GeoSeries(self.geoms, crs=27700)
assert s.crs == self.osgb
assert s.values.crs == self.osgb
arr = from_shapely(self.geoms, crs=27700)
s = GeoSeries(arr)
assert s.crs == self.osgb
assert s.values.crs == self.osgb
with pytest.raises(
ValueError,
match="CRS mismatch between CRS of the passed geometries and 'crs'",
):
s = GeoSeries(arr, crs=4326)
assert s.crs == self.osgb
def test_dataframe(self):
arr = from_shapely(self.geoms, crs=27700)
df = GeoDataFrame(geometry=arr)
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
arr = from_shapely(self.geoms)
s = GeoSeries(arr, crs=27700)
df = GeoDataFrame(geometry=s)
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
# different passed CRS than array CRS is now an error
match_str = "CRS mismatch between CRS of the passed geometries and 'crs'"
with pytest.raises(ValueError, match=match_str):
df = GeoDataFrame(geometry=s, crs=4326)
with pytest.raises(ValueError, match=match_str):
GeoDataFrame(geometry=s, crs=4326)
with pytest.raises(ValueError, match=match_str):
GeoDataFrame({"data": [1, 2], "geometry": s}, crs=4326)
with pytest.raises(ValueError, match=match_str):
GeoDataFrame(df, crs=4326).crs
# manually change CRS
arr = from_shapely(self.geoms)
s = GeoSeries(arr, crs=27700)
df = GeoDataFrame(geometry=s)
df = df.set_crs(crs="epsg:4326", allow_override=True)
assert df.crs == self.wgs
assert df.geometry.crs == self.wgs
assert df.geometry.values.crs == self.wgs
with pytest.raises(ValueError, match="Assigning CRS to a GeoDataFrame without"):
GeoDataFrame(self.geoms, columns=["geom"], crs=27700)
with pytest.raises(ValueError, match="Assigning CRS to a GeoDataFrame without"):
GeoDataFrame(crs=27700)
df = GeoDataFrame(self.geoms, columns=["geom"])
df = df.set_geometry("geom", crs=27700)
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
assert df.geom.crs == self.osgb
assert df.geom.values.crs == self.osgb
df = GeoDataFrame(geometry=self.geoms, crs=27700)
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
# new geometry with set CRS has priority over GDF CRS
df = GeoDataFrame(geometry=self.geoms, crs=27700)
df = df.set_geometry(self.geoms, crs=4326)
assert df.crs == self.wgs
assert df.geometry.crs == self.wgs
assert df.geometry.values.crs == self.wgs
arr = from_shapely(self.geoms)
s = GeoSeries(arr, crs=27700)
df = GeoDataFrame()
df = df.set_geometry(s)
assert df._geometry_column_name == "geometry"
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
arr = from_shapely(self.geoms, crs=27700)
df = GeoDataFrame()
df = df.set_geometry(arr)
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
arr = from_shapely(self.geoms)
df = GeoDataFrame({"col1": [1, 2], "geometry": arr}, crs=4326)
assert df.crs == self.wgs
assert df.geometry.crs == self.wgs
assert df.geometry.values.crs == self.wgs
arr = from_shapely(self.geoms, crs=4326)
df = GeoDataFrame({"col1": [1, 2], "geometry": arr})
assert df.crs == self.wgs
assert df.geometry.crs == self.wgs
assert df.geometry.values.crs == self.wgs
# geometry column name None on init
df = GeoDataFrame({"geometry": [0, 1]})
with pytest.raises(
ValueError,
match="Assigning CRS to a GeoDataFrame without a geometry",
):
df.crs = 27700
# geometry column without geometry
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore", "Geometry column does not contain geometry", UserWarning
)
df = GeoDataFrame({"geometry": [Point(0, 1)]}).assign(geometry=[0])
with pytest.raises(
ValueError,
match="Assigning CRS to a GeoDataFrame without an active geometry",
):
df.crs = 27700
with pytest.raises(
AttributeError,
match="The CRS attribute of a GeoDataFrame without an active",
):
assert df.crs == self.osgb
def test_dataframe_getitem_without_geometry_column(self):
df = GeoDataFrame({"col": range(10)}, geometry=self.arr)
df["geom2"] = df.geometry.centroid
subset = df[["col", "geom2"]]
with pytest.raises(
AttributeError,
match="The CRS attribute of a GeoDataFrame without an active",
):
assert subset.crs == self.osgb
def test_dataframe_setitem(self):
# new geometry CRS has priority over GDF CRS
arr = from_shapely(self.geoms)
s = GeoSeries(arr, crs=27700)
df = GeoDataFrame()
with pytest.warns(
FutureWarning, match="You are adding a column named 'geometry'"
):
df["geometry"] = s
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
arr = from_shapely(self.geoms, crs=27700)
df = GeoDataFrame()
with pytest.warns(
FutureWarning, match="You are adding a column named 'geometry'"
):
df["geometry"] = arr
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
# test to_crs case (GH1960)
arr = from_shapely(self.geoms)
df = GeoDataFrame({"col1": [1, 2], "geometry": arr}, crs=4326)
df["geometry"] = df["geometry"].to_crs(27700)
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
# test changing geometry crs not in the geometry column doesn't change the crs
arr = from_shapely(self.geoms)
df = GeoDataFrame(
{"col1": [1, 2], "geometry": arr, "other_geom": arr}, crs=4326
)
df["other_geom"] = from_shapely(self.geoms, crs=27700)
assert df.crs == self.wgs
assert df.geometry.crs == self.wgs
assert df["geometry"].crs == self.wgs
assert df["other_geom"].crs == self.osgb
def test_dataframe_setitem_without_geometry_column(self):
arr = from_shapely(self.geoms)
df = GeoDataFrame({"col1": [1, 2], "geometry": arr}, crs=4326)
# override geometry with non geometry
with pytest.warns(UserWarning):
df["geometry"] = 1
# assigning a list of geometry object doesn't have cached access to 4326
df["geometry"] = self.geoms
assert df.crs is None
@pytest.mark.parametrize(
"scalar", [None, Point(0, 0), LineString([(0, 0), (1, 1)])]
)
def test_scalar(self, scalar):
df = GeoDataFrame()
with pytest.warns(
FutureWarning, match="You are adding a column named 'geometry'"
):
df["geometry"] = scalar
df = df.set_crs(4326)
assert df.crs == self.wgs
assert df.geometry.crs == self.wgs
assert df.geometry.values.crs == self.wgs
@pytest.mark.filterwarnings("ignore:Accessing CRS")
def test_crs_with_no_geom_fails(self):
with pytest.raises(ValueError, match="Assigning CRS to a GeoDataFrame without"):
df = GeoDataFrame()
df.crs = 4326
def test_read_file(self, nybb_filename):
df = read_file(nybb_filename)
assert df.crs == pyproj.CRS(2263)
assert df.geometry.crs == pyproj.CRS(2263)
assert df.geometry.values.crs == pyproj.CRS(2263)
def test_multiple_geoms(self):
arr = from_shapely(self.geoms, crs=27700)
s = GeoSeries(self.geoms, crs=4326)
df = GeoDataFrame(s, geometry=arr, columns=["col1"])
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
assert df.col1.crs == self.wgs
assert df.col1.values.crs == self.wgs
def test_multiple_geoms_set_geom(self):
arr = from_shapely(self.geoms, crs=27700)
s = GeoSeries(self.geoms, crs=4326)
df = GeoDataFrame(s, geometry=arr, columns=["col1"])
df = df.set_geometry("col1")
assert df.crs == self.wgs
assert df.geometry.crs == self.wgs
assert df.geometry.values.crs == self.wgs
assert df["geometry"].crs == self.osgb
assert df["geometry"].values.crs == self.osgb
def test_assign_cols(self):
arr = from_shapely(self.geoms, crs=27700)
s = GeoSeries(self.geoms, crs=4326)
df = GeoDataFrame(s, geometry=arr, columns=["col1"])
df["geom2"] = s
df["geom3"] = s.values
df["geom4"] = from_shapely(self.geoms)
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
assert df.geom2.crs == self.wgs
assert df.geom2.values.crs == self.wgs
assert df.geom3.crs == self.wgs
assert df.geom3.values.crs == self.wgs
assert df.geom4.crs is None
assert df.geom4.values.crs is None
def test_copy(self):
arr = from_shapely(self.geoms, crs=27700)
s = GeoSeries(self.geoms, crs=4326)
df = GeoDataFrame(s, geometry=arr, columns=["col1"])
arr_copy = arr.copy()
assert arr_copy.crs == arr.crs
s_copy = s.copy()
assert s_copy.crs == s.crs
assert s_copy.values.crs == s.values.crs
df_copy = df.copy()
assert df_copy.crs == df.crs
assert df_copy.geometry.crs == df.geometry.crs
assert df_copy.geometry.values.crs == df.geometry.values.crs
assert df_copy.col1.crs == df.col1.crs
assert df_copy.col1.values.crs == df.col1.values.crs
def test_rename(self):
arr = from_shapely(self.geoms, crs=27700)
s = GeoSeries(self.geoms, crs=4326)
df = GeoDataFrame(s, geometry=arr, columns=["col1"])
df = df.rename(columns={"geometry": "geom"}).set_geometry("geom")
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
df = df.rename_geometry("geom2")
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
df = df.rename(columns={"col1": "column1"})
assert df.column1.crs == self.wgs
assert df.column1.values.crs == self.wgs
def test_geoseries_to_crs(self):
s = GeoSeries(self.geoms, crs=27700)
s = s.to_crs(4326)
assert s.crs == self.wgs
assert s.values.crs == self.wgs
df = GeoDataFrame(geometry=s)
assert df.crs == self.wgs
df = df.to_crs(27700)
assert df.crs == self.osgb
assert df.geometry.crs == self.osgb
assert df.geometry.values.crs == self.osgb
# make sure that only active geometry is transformed
arr = from_shapely(self.geoms, crs=4326)
df["col1"] = arr
df = df.to_crs(3857)
assert df.col1.crs == self.wgs
assert df.col1.values.crs == self.wgs
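# The contrast relied on here (a sketch, assuming standard geopandas
# semantics): ``to_crs`` reprojects coordinates and therefore requires an
# existing CRS, while ``set_crs`` merely (re)labels them. Only the active
# geometry column is reprojected -- e.g. POINT (1 1) in EPSG:4326 becomes
# roughly POINT (111319.49 111325.14) in EPSG:3857 -- while extra geometry
# columns such as "col1" keep both their coordinates and their CRS.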
def test_array_to_crs(self):
arr = from_shapely(self.geoms, crs=27700)
arr = arr.to_crs(4326)
assert arr.crs == self.wgs
def test_from_shapely(self):
arr = from_shapely(self.geoms, crs=27700)
assert arr.crs == self.osgb
def test_from_wkb(self):
L_wkb = [p.wkb for p in self.geoms]
arr = from_wkb(L_wkb, crs=27700)
assert arr.crs == self.osgb
def test_from_wkt(self):
L_wkt = [p.wkt for p in self.geoms]
arr = from_wkt(L_wkt, crs=27700)
assert arr.crs == self.osgb
def test_points_from_xy(self):
df = pd.DataFrame([{"x": x, "y": x, "z": x} for x in range(10)])
arr = points_from_xy(df["x"], df["y"], crs=27700)
assert arr.crs == self.osgb
# setting the CRS on a GeoSeries should not set it on a passed array that has no CRS
def test_original(self):
arr = from_shapely(self.geoms)
s = GeoSeries(arr, crs=27700)
assert arr.crs is None
assert s.crs == self.osgb
def test_ops(self):
arr = self.arr
bound = arr.boundary
assert bound.crs == self.osgb
cent = arr.centroid
assert cent.crs == self.osgb
hull = arr.convex_hull
assert hull.crs == self.osgb
envelope = arr.envelope
assert envelope.crs == self.osgb
exterior = arr.exterior
assert exterior.crs == self.osgb
representative_point = arr.representative_point()
assert representative_point.crs == self.osgb
def test_binary_ops(self):
arr = self.arr
quads = []
while len(quads) < 10:
geom = Polygon([(random.random(), random.random()) for i in range(4)])
if geom.is_valid:
quads.append(geom)
arr2 = from_shapely(quads, crs=27700)
difference = arr.difference(arr2)
assert difference.crs == self.osgb
intersection = arr.intersection(arr2)
assert intersection.crs == self.osgb
symmetric_difference = arr.symmetric_difference(arr2)
assert symmetric_difference.crs == self.osgb
union = arr.union(arr2)
assert union.crs == self.osgb
def test_other(self):
arr = self.arr
buffer = arr.buffer(5)
assert buffer.crs == self.osgb
interpolate = arr.exterior.interpolate(0.1)
assert interpolate.crs == self.osgb
simplify = arr.simplify(5)
assert simplify.crs == self.osgb
@pytest.mark.parametrize(
"attr, arg",
[
("affine_transform", ([0, 1, 1, 0, 0, 0],)),
("translate", ()),
("rotate", (10,)),
("scale", ()),
("skew", ()),
],
)
def test_affinity_methods(self, attr, arg):
result = getattr(self.arr, attr)(*arg)
assert result.crs == self.osgb
def test_slice(self):
s = GeoSeries(self.arr, crs=27700)
assert s.iloc[1:].values.crs == self.osgb
df = GeoDataFrame({"col1": self.arr}, geometry=s)
assert df.iloc[1:].geometry.values.crs == self.osgb
assert df.iloc[1:].col1.values.crs == self.osgb
def test_concat(self):
s = GeoSeries(self.arr, crs=27700)
assert pd.concat([s, s]).values.crs == self.osgb
df = GeoDataFrame({"col1": from_shapely(self.geoms, crs=4326)}, geometry=s)
assert pd.concat([df, df]).geometry.values.crs == self.osgb
assert pd.concat([df, df]).col1.values.crs == self.wgs
def test_merge(self):
arr = from_shapely(self.geoms, crs=27700)
s = GeoSeries(self.geoms, crs=4326)
df = GeoDataFrame({"col1": s}, geometry=arr)
df2 = GeoDataFrame({"col2": s}, geometry=arr).rename_geometry("geom")
merged = df.merge(df2, left_index=True, right_index=True)
assert merged.col1.values.crs == self.wgs
assert merged.geometry.values.crs == self.osgb
assert merged.col2.values.crs == self.wgs
assert merged.geom.values.crs == self.osgb
assert merged.crs == self.osgb
# make sure a geometry column assigned from a list keeps its CRS (__setitem__)
def test_setitem_geometry(self):
arr = from_shapely(self.geoms, crs=27700)
df = GeoDataFrame({"col1": [0, 1]}, geometry=arr)
df["geometry"] = list(df.geometry)
assert df.geometry.values.crs == self.osgb
df2 = GeoDataFrame({"col1": [0, 1]}, geometry=arr)
df2["geometry"] = from_shapely(self.geoms, crs=4326)
assert df2.geometry.values.crs == self.wgs
def test_astype(self):
arr = from_shapely(self.geoms, crs=27700)
df = GeoDataFrame({"col1": [0, 1]}, geometry=arr)
df2 = df.astype({"col1": str})
assert df2.crs == self.osgb
def test_apply(self):
s = GeoSeries(self.arr)
assert s.crs == 27700
# apply preserves the CRS if the result is a GeoSeries
result = s.apply(lambda x: x.centroid)
assert result.crs == 27700
def test_apply_geodataframe(self):
df = GeoDataFrame({"col1": [0, 1]}, geometry=self.geoms, crs=27700)
assert df.crs == 27700
# apply preserves the CRS if the result is a GeoDataFrame
result = df.apply(lambda col: col, axis=0)
assert result.crs == 27700
result = df.apply(lambda row: row, axis=1)
assert result.crs == 27700
class TestSetCRS:
@pytest.mark.parametrize(
"constructor",
[
lambda geoms, crs: GeoSeries(geoms, crs=crs),
lambda geoms, crs: GeoDataFrame(geometry=geoms, crs=crs),
],
ids=["geoseries", "geodataframe"],
)
def test_set_crs(self, constructor):
naive = constructor([Point(0, 0), Point(1, 1)], crs=None)
assert naive.crs is None
# by default returns a copy
result = naive.set_crs(crs="EPSG:4326")
assert result.crs == "EPSG:4326"
assert naive.crs is None
result = naive.set_crs(epsg=4326)
assert result.crs == "EPSG:4326"
assert naive.crs is None
# with inplace=True
result = naive.set_crs(crs="EPSG:4326", inplace=True)
assert result is naive
assert result.crs == naive.crs == "EPSG:4326"
# raise for non-naive when crs would be overridden
non_naive = constructor([Point(0, 0), Point(1, 1)], crs="EPSG:4326")
assert non_naive.crs == "EPSG:4326"
with pytest.raises(ValueError, match="already has a CRS"):
non_naive.set_crs("EPSG:3857")
# allow for equal crs
result = non_naive.set_crs("EPSG:4326")
assert result.crs == "EPSG:4326"
# replace with allow_override=True
result = non_naive.set_crs("EPSG:3857", allow_override=True)
assert non_naive.crs == "EPSG:4326"
assert result.crs == "EPSG:3857"
result = non_naive.set_crs("EPSG:3857", allow_override=True, inplace=True)
assert non_naive.crs == "EPSG:3857"
assert result.crs == "EPSG:3857"
# set CRS to None
result = non_naive.set_crs(crs=None, allow_override=True)
assert result.crs is None
assert non_naive.crs == "EPSG:3857"
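# Summary sketch of the ``set_crs`` contract exercised above:
#
#   naive.set_crs(4326)                           # returns a copy with a CRS
#   naive.set_crs(4326, inplace=True)             # mutates and returns self
#   non_naive.set_crs(3857)                       # ValueError: already has a CRS
#   non_naive.set_crs(3857, allow_override=True)  # re-label, no reprojection
#
# ``set_crs`` never transforms coordinates; ``to_crs`` does that.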

View File

@@ -1,15 +0,0 @@
from geopandas import GeoDataFrame, read_file
from geopandas.datasets import get_path
import pytest
@pytest.mark.parametrize(
"test_dataset", ["naturalearth_lowres", "naturalearth_cities", "nybb", "foo"]
)
def test_read_paths(test_dataset):
with pytest.raises(
AttributeError,
match=r"The geopandas\.dataset has been deprecated and was removed",
):
assert isinstance(read_file(get_path(test_dataset)), GeoDataFrame)
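# The bundled geopandas.datasets module was removed in geopandas 1.0. As a
# hedged pointer (not exercised by this test), the separate ``geodatasets``
# package is the usual replacement:
#
#   import geodatasets
#   path = geodatasets.get_path("nybb")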

View File

@@ -1,87 +0,0 @@
from textwrap import dedent
from geopandas._decorator import doc
@doc(method="cumsum", operation="sum")
def cumsum(whatever):
"""
This is the {method} method.
It computes the cumulative {operation}.
"""
@doc(
cumsum,
dedent(
"""
Examples
--------
>>> cumavg([1, 2, 3])
2
"""
),
method="cumavg",
operation="average",
)
def cumavg(whatever): ...
@doc(cumsum, method="cummax", operation="maximum")
def cummax(whatever): ...
@doc(cummax, method="cummin", operation="minimum")
def cummin(whatever): ...
def test_docstring_formatting():
docstr = dedent(
"""
This is the cumsum method.
It computes the cumulative sum.
"""
)
assert cumsum.__doc__ == docstr
def test_docstring_appending():
docstr = dedent(
"""
This is the cumavg method.
It computes the cumulative average.
Examples
--------
>>> cumavg([1, 2, 3])
2
"""
)
assert cumavg.__doc__ == docstr
def test_doc_template_from_func():
docstr = dedent(
"""
This is the cummax method.
It computes the cumulative maximum.
"""
)
assert cummax.__doc__ == docstr
def test_inherit_doc_template():
docstr = dedent(
"""
This is the cummin method.
It computes the cumulative minimum.
"""
)
assert cummin.__doc__ == docstr
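# A minimal sketch of how such a ``doc`` decorator can work (an assumption:
# the real geopandas._decorator.doc, adapted from pandas, also keeps the raw
# templates around so that chained decorations can be re-formatted):
def _doc_sketch(*docstrings, **params):
    from textwrap import dedent

    def decorator(func):
        components = []
        for d in docstrings:
            if hasattr(d, "_docstring_components"):
                # reuse the raw, unformatted templates of a decorated function
                components.extend(d._docstring_components)
            elif isinstance(d, str):
                components.append(d)
            elif d.__doc__:
                components.append(d.__doc__)
        if func.__doc__:
            components.append(dedent(func.__doc__))
        # format every string component with the supplied parameters
        func.__doc__ = "".join(c.format(**params) for c in components)
        func._docstring_components = components
        return func

    return decorator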

View File

@@ -1,372 +0,0 @@
import warnings
import numpy as np
import pandas as pd
import geopandas
from geopandas import GeoDataFrame, read_file
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_15, PANDAS_GE_20, PANDAS_GE_30
import pytest
from geopandas.testing import assert_geodataframe_equal, geom_almost_equals
from pandas.testing import assert_frame_equal
@pytest.fixture
def nybb_polydf(nybb_filename):
nybb_polydf = read_file(nybb_filename)
nybb_polydf = nybb_polydf[["geometry", "BoroName", "BoroCode"]]
nybb_polydf = nybb_polydf.rename(columns={"geometry": "myshapes"})
nybb_polydf = nybb_polydf.set_geometry("myshapes")
nybb_polydf["manhattan_bronx"] = 5
nybb_polydf.loc[3:4, "manhattan_bronx"] = 6
nybb_polydf["BoroCode"] = nybb_polydf["BoroCode"].astype("int64")
return nybb_polydf
@pytest.fixture
def merged_shapes(nybb_polydf):
# Merged geometry
manhattan_bronx = nybb_polydf.loc[3:4]
others = nybb_polydf.loc[0:2]
collapsed = [others.geometry.union_all(), manhattan_bronx.geometry.union_all()]
merged_shapes = GeoDataFrame(
{"myshapes": collapsed},
geometry="myshapes",
index=pd.Index([5, 6], name="manhattan_bronx"),
crs=nybb_polydf.crs,
)
return merged_shapes
@pytest.fixture
def first(merged_shapes):
first = merged_shapes.copy()
first["BoroName"] = ["Staten Island", "Manhattan"]
first["BoroCode"] = [5, 1]
return first
@pytest.fixture
def expected_mean(merged_shapes):
test_mean = merged_shapes.copy()
test_mean["BoroCode"] = [4, 1.5]
return test_mean
def test_geom_dissolve(nybb_polydf, first):
test = nybb_polydf.dissolve("manhattan_bronx")
assert test.geometry.name == "myshapes"
assert geom_almost_equals(test, first)
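# A toy usage sketch of ``dissolve`` (assuming the default aggfunc, "first"):
# it groups rows by a key, unions each group's geometries, and aggregates the
# remaining columns.
def _dissolve_usage_sketch():
    from shapely.geometry import Point

    toy = GeoDataFrame(
        {"key": ["a", "a", "b"], "val": [1, 2, 3]},
        geometry=[Point(0, 0), Point(1, 1), Point(2, 2)],
    )
    out = toy.dissolve("key")
    # out.loc["a", "geometry"] is MULTIPOINT (0 0, 1 1)
    # out.loc["a", "val"] == 1 because "first" keeps the first value per group
    return out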
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
def test_dissolve_retains_existing_crs(nybb_polydf):
assert nybb_polydf.crs is not None
test = nybb_polydf.dissolve("manhattan_bronx")
assert test.crs is not None
def test_dissolve_retains_nonexisting_crs(nybb_polydf):
nybb_polydf.geometry.array.crs = None
test = nybb_polydf.dissolve("manhattan_bronx")
assert test.crs is None
def test_first_dissolve(nybb_polydf, first):
test = nybb_polydf.dissolve("manhattan_bronx")
assert_frame_equal(first, test, check_column_type=False)
def test_mean_dissolve(nybb_polydf, first, expected_mean):
if not PANDAS_GE_15:
test = nybb_polydf.dissolve("manhattan_bronx", aggfunc="mean")
test2 = nybb_polydf.dissolve("manhattan_bronx", aggfunc=np.mean)
elif PANDAS_GE_15 and not PANDAS_GE_20:
with pytest.warns(FutureWarning, match=".*used in dissolve is deprecated.*"):
test = nybb_polydf.dissolve("manhattan_bronx", aggfunc="mean")
test2 = nybb_polydf.dissolve("manhattan_bronx", aggfunc=np.mean)
else: # pandas 2.0
test = nybb_polydf.dissolve(
"manhattan_bronx", aggfunc="mean", numeric_only=True
)
# for the non-pandas np.mean, numeric_only cannot be applied, so drop non-numeric columns manually
test2 = nybb_polydf.drop(columns=["BoroName"]).dissolve(
"manhattan_bronx", aggfunc="mean"
)
assert_frame_equal(expected_mean, test, check_column_type=False)
assert_frame_equal(expected_mean, test2, check_column_type=False)
@pytest.mark.skipif(not PANDAS_GE_15 or PANDAS_GE_20, reason="warning for pandas 1.5.x")
def test_mean_dissolve_warning_capture(nybb_polydf, first, expected_mean):
with pytest.warns(
FutureWarning,
match=".*used in dissolve is deprecated.*",
):
nybb_polydf.dissolve("manhattan_bronx", aggfunc="mean")
# test that aggfunc="first", which has no numeric_only semantics, emits no warning
with warnings.catch_warnings():
warnings.simplefilter("error")
nybb_polydf.dissolve("manhattan_bronx", aggfunc="first")
def test_dissolve_emits_other_warnings(nybb_polydf):
# we only do something special for pandas 1.5.x, but expect this
# test to pass on any version
def sum_and_warn(group):
warnings.warn("foo") # noqa: B028
if PANDAS_GE_20:
return group.sum(numeric_only=False)
else:
return group.sum()
with pytest.warns(UserWarning, match="foo"):
nybb_polydf.dissolve("manhattan_bronx", aggfunc=sum_and_warn)
def test_multicolumn_dissolve(nybb_polydf, first):
multi = nybb_polydf.copy()
multi["dup_col"] = multi.manhattan_bronx
multi_test = multi.dissolve(["manhattan_bronx", "dup_col"], aggfunc="first")
first_copy = first.copy()
first_copy["dup_col"] = first_copy.index
first_copy = first_copy.set_index([first_copy.index, "dup_col"])
assert_frame_equal(multi_test, first_copy, check_column_type=False)
def test_reset_index(nybb_polydf, first):
test = nybb_polydf.dissolve("manhattan_bronx", as_index=False)
comparison = first.reset_index()
assert_frame_equal(comparison, test, check_column_type=False)
def test_dissolve_none(nybb_polydf):
test = nybb_polydf.dissolve(by=None)
expected = GeoDataFrame(
{
nybb_polydf.geometry.name: [nybb_polydf.geometry.union_all()],
"BoroName": ["Staten Island"],
"BoroCode": [5],
"manhattan_bronx": [5],
},
geometry=nybb_polydf.geometry.name,
crs=nybb_polydf.crs,
)
assert_frame_equal(expected, test, check_column_type=False)
def test_dissolve_none_mean(nybb_polydf):
test = nybb_polydf.dissolve(aggfunc="mean", numeric_only=True)
expected = GeoDataFrame(
{
nybb_polydf.geometry.name: [nybb_polydf.geometry.union_all()],
"BoroCode": [3.0],
"manhattan_bronx": [5.4],
},
geometry=nybb_polydf.geometry.name,
crs=nybb_polydf.crs,
)
assert_frame_equal(expected, test, check_column_type=False)
def test_dissolve_level():
gdf = geopandas.GeoDataFrame(
{
"a": [1, 1, 2, 2],
"b": [3, 4, 4, 4],
"c": [3, 4, 5, 6],
"geometry": geopandas.array.from_wkt(
["POINT (0 0)", "POINT (1 1)", "POINT (2 2)", "POINT (3 3)"]
),
}
).set_index(["a", "b", "c"])
expected_a = geopandas.GeoDataFrame(
{
"a": [1, 2],
"geometry": geopandas.array.from_wkt(
["MULTIPOINT (0 0, 1 1)", "MULTIPOINT (2 2, 3 3)"]
),
}
).set_index("a")
expected_b = geopandas.GeoDataFrame(
{
"b": [3, 4],
"geometry": geopandas.array.from_wkt(
["POINT (0 0)", "MULTIPOINT (1 1, 2 2, 3 3)"]
),
}
).set_index("b")
expected_ab = geopandas.GeoDataFrame(
{
"a": [1, 1, 2],
"b": [3, 4, 4],
"geometry": geopandas.array.from_wkt(
["POINT (0 0)", "POINT (1 1)", "MULTIPOINT (2 2, 3 3)"]
),
}
).set_index(["a", "b"])
assert_frame_equal(expected_a, gdf.dissolve(level=0))
assert_frame_equal(expected_a, gdf.dissolve(level="a"))
assert_frame_equal(expected_b, gdf.dissolve(level=1))
assert_frame_equal(expected_b, gdf.dissolve(level="b"))
assert_frame_equal(expected_ab, gdf.dissolve(level=[0, 1]))
assert_frame_equal(expected_ab, gdf.dissolve(level=["a", "b"]))
def test_dissolve_sort():
gdf = geopandas.GeoDataFrame(
{
"a": [2, 1, 1],
"geometry": geopandas.array.from_wkt(
["POINT (0 0)", "POINT (1 1)", "POINT (2 2)"]
),
}
)
expected_unsorted = geopandas.GeoDataFrame(
{
"a": [2, 1],
"geometry": geopandas.array.from_wkt(
["POINT (0 0)", "MULTIPOINT (1 1, 2 2)"]
),
}
).set_index("a")
expected_sorted = expected_unsorted.sort_index()
assert_frame_equal(expected_sorted, gdf.dissolve("a"))
assert_frame_equal(expected_unsorted, gdf.dissolve("a", sort=False))
def test_dissolve_categorical():
gdf = geopandas.GeoDataFrame(
{
"cat": pd.Categorical(["a", "a", "b", "b"]),
"noncat": [1, 1, 1, 2],
"to_agg": [1, 2, 3, 4],
"geometry": geopandas.array.from_wkt(
["POINT (0 0)", "POINT (1 1)", "POINT (2 2)", "POINT (3 3)"]
),
}
)
# when observed=False we get an additional observation
# that wasn't in the original data
none_val = "GEOMETRYCOLLECTION EMPTY" if PANDAS_GE_30 else None
expected_gdf_observed_false = geopandas.GeoDataFrame(
{
"cat": pd.Categorical(["a", "a", "b", "b"]),
"noncat": [1, 2, 1, 2],
"geometry": geopandas.array.from_wkt(
[
"MULTIPOINT (0 0, 1 1)",
none_val,
"POINT (2 2)",
"POINT (3 3)",
]
),
"to_agg": [1, None, 3, 4],
}
).set_index(["cat", "noncat"])
# when observed=True we do not get any additional observations
expected_gdf_observed_true = geopandas.GeoDataFrame(
{
"cat": pd.Categorical(["a", "b", "b"]),
"noncat": [1, 1, 2],
"geometry": geopandas.array.from_wkt(
["MULTIPOINT (0 0, 1 1)", "POINT (2 2)", "POINT (3 3)"]
),
"to_agg": [1, 3, 4],
}
).set_index(["cat", "noncat"])
assert_frame_equal(expected_gdf_observed_false, gdf.dissolve(["cat", "noncat"]))
assert_frame_equal(
expected_gdf_observed_true, gdf.dissolve(["cat", "noncat"], observed=True)
)
def test_dissolve_dropna():
gdf = geopandas.GeoDataFrame(
{
"a": [1, 1, None],
"geometry": geopandas.array.from_wkt(
["POINT (0 0)", "POINT (1 1)", "POINT (2 2)"]
),
}
)
expected_with_na = geopandas.GeoDataFrame(
{
"a": [1.0, np.nan],
"geometry": geopandas.array.from_wkt(
["MULTIPOINT (0 0, 1 1)", "POINT (2 2)"]
),
}
).set_index("a")
expected_no_na = geopandas.GeoDataFrame(
{
"a": [1.0],
"geometry": geopandas.array.from_wkt(["MULTIPOINT (0 0, 1 1)"]),
}
).set_index("a")
assert_frame_equal(expected_with_na, gdf.dissolve("a", dropna=False))
assert_frame_equal(expected_no_na, gdf.dissolve("a"))
def test_dissolve_dropna_warn(nybb_polydf):
# No warning with default params
with warnings.catch_warnings(record=True) as record:
nybb_polydf.dissolve()
for r in record:
assert "dropna kwarg is not supported" not in str(r.message)
def test_dissolve_multi_agg(nybb_polydf, merged_shapes):
merged_shapes[("BoroCode", "min")] = [3, 1]
merged_shapes[("BoroCode", "max")] = [5, 2]
merged_shapes[("BoroName", "count")] = [3, 2]
with warnings.catch_warnings(record=True) as record:
test = nybb_polydf.dissolve(
by="manhattan_bronx",
aggfunc={
"BoroCode": ["min", "max"],
"BoroName": "count",
},
)
assert_geodataframe_equal(test, merged_shapes)
assert len(record) == 0
def test_coverage_dissolve(nybb_polydf):
manhattan_bronx = nybb_polydf.loc[3:4]
others = nybb_polydf.loc[0:2]
collapsed = [
others.geometry.union_all(method="coverage"),
manhattan_bronx.geometry.union_all(method="coverage"),
]
merged_shapes = GeoDataFrame(
{"myshapes": collapsed},
geometry="myshapes",
index=pd.Index([5, 6], name="manhattan_bronx"),
crs=nybb_polydf.crs,
)
merged_shapes["BoroName"] = ["Staten Island", "Manhattan"]
merged_shapes["BoroCode"] = [5, 1]
test = nybb_polydf.dissolve("manhattan_bronx", method="coverage")
assert_frame_equal(merged_shapes, test, check_column_type=False)
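# A note on method="coverage" (hedged): GEOS coverage union assumes the inputs
# form a valid coverage -- polygons that may share edges but do not overlap --
# and under that assumption it is typically much faster than the default unary
# union; it does not clean up overlapping inputs.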

View File

@@ -1,648 +0,0 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite (by inheriting the pandas test suite), and should
contain no other tests.
Other tests (e.g. related to the spatial functionality or integration
with GeoSeries/GeoDataFrame) should be added to test_array.py and others.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
A set of fixtures is defined to provide data for the tests (these fixtures
are expected to be available to pytest by the inherited pandas tests).
"""
import itertools
import operator
import numpy as np
import pandas as pd
from pandas.tests.extension import base as extension_tests
import shapely.geometry
from shapely.geometry import Point
from geopandas._compat import PANDAS_GE_15, PANDAS_GE_21, PANDAS_GE_22
from geopandas.array import GeometryArray, GeometryDtype, from_shapely
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal
# -----------------------------------------------------------------------------
# Compat with extension tests in older pandas versions
# -----------------------------------------------------------------------------
not_yet_implemented = pytest.mark.skip(reason="Not yet implemented")
no_minmax = pytest.mark.skip(reason="Min/max not supported")
# -----------------------------------------------------------------------------
# Required fixtures
# -----------------------------------------------------------------------------
@pytest.fixture
def dtype():
"""A fixture providing the ExtensionDtype to validate."""
return GeometryDtype()
def make_data():
a = np.empty(100, dtype=object)
a[:] = [shapely.geometry.Point(i, i) for i in range(100)]
ga = from_shapely(a)
return ga
@pytest.fixture
def data():
"""Length-100 array for this type.
* data[0] and data[1] should both be non missing
* data[0] and data[1] should not be equal
"""
return make_data()
@pytest.fixture
def data_for_twos():
"""Length-100 array in which all the elements are two."""
raise NotImplementedError
@pytest.fixture
def data_missing():
"""Length-2 array with [NA, Valid]"""
return from_shapely([None, shapely.geometry.Point(1, 1)])
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
"""Parametrized fixture giving 'data' and 'data_missing'"""
if request.param == "data":
return data
elif request.param == "data_missing":
return data_missing
@pytest.fixture
def data_repeated(data):
"""
Generate many datasets.
Parameters
----------
data : fixture implementing `data`
Returns
-------
Callable[[int], Generator]:
A callable that takes a `count` argument and
returns a generator yielding `count` datasets.
"""
def gen(count):
for _ in range(count):
yield data
return gen
@pytest.fixture
def data_for_sorting():
"""Length-3 array with a known sort order.
This should be three items [B, C, A] with
A < B < C
"""
return from_shapely([Point(0, 1), Point(1, 1), Point(0, 0)])
@pytest.fixture
def data_missing_for_sorting():
"""Length-3 array with a known sort order.
This should be three items [B, NA, A] with
A < B and NA missing.
"""
return from_shapely([Point(1, 2), None, Point(0, 0)])
@pytest.fixture
def na_cmp():
"""Binary operator for comparing NA values.
Should return a function of two arguments that returns
True if both arguments are (scalar) NA for your type.
By default, uses ``operator.is_``
"""
return lambda x, y: x is None and y is None
@pytest.fixture
def na_value():
"""The scalar missing value for this type. Default 'None'"""
return None
@pytest.fixture
def data_for_grouping():
"""Data for factorization, grouping, and unique tests.
Expected to be like [B, B, NA, NA, A, A, B, C]
Where A < B < C and NA is missing
"""
return from_shapely(
[
shapely.geometry.Point(1, 1),
shapely.geometry.Point(1, 1),
None,
None,
shapely.geometry.Point(0, 0),
shapely.geometry.Point(0, 0),
shapely.geometry.Point(1, 1),
shapely.geometry.Point(2, 2),
]
)
@pytest.fixture(params=[True, False])
def box_in_series(request):
"""Whether to box the data in a Series"""
return request.param
@pytest.fixture(
params=[
lambda x: 1,
lambda x: [1] * len(x),
lambda x: pd.Series([1] * len(x)),
lambda x: x,
],
ids=["scalar", "list", "series", "object"],
)
def groupby_apply_op(request):
"""
Functions to test groupby.apply().
"""
return request.param
@pytest.fixture(params=[True, False])
def as_frame(request):
"""
Boolean fixture to support Series and Series.to_frame() comparison testing.
"""
return request.param
@pytest.fixture(params=[True, False])
def as_series(request):
"""
Boolean fixture to support arr and Series(arr) comparison testing.
"""
return request.param
@pytest.fixture(params=[True, False])
def use_numpy(request):
"""
Boolean fixture to support comparison testing of ExtensionDtype array
and numpy array.
"""
return request.param
@pytest.fixture(params=["ffill", "bfill"])
def fillna_method(request):
"""
Parametrized fixture giving method parameters 'ffill' and 'bfill' for
Series.fillna(method=<method>) testing.
"""
return request.param
@pytest.fixture(params=[True, False])
def as_array(request):
"""
Boolean fixture to support ExtensionDtype _from_sequence method testing.
"""
return request.param
@pytest.fixture
def invalid_scalar(data):
"""
A scalar that *cannot* be held by this ExtensionArray.
The default should work for most subclasses, but is not guaranteed.
If the array can hold any item (i.e. object dtype), then use pytest.skip.
"""
return object.__new__(object)
# Fixtures defined in pandas/conftest.py that are also needed: defining them
# here instead of importing for compatibility
@pytest.fixture(
params=["sum", "max", "min", "mean", "prod", "std", "var", "median", "kurt", "skew"]
)
def all_numeric_reductions(request):
"""
Fixture for numeric reduction names
"""
return request.param
@pytest.fixture(params=["all", "any"])
def all_boolean_reductions(request):
"""
Fixture for boolean reduction names
"""
return request.param
# only == and != are supported for GeometryArray
# @pytest.fixture(params=["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"])
@pytest.fixture(params=["__eq__", "__ne__"])
def all_compare_operators(request):
"""
Fixture for dunder names for common compare operations
* >=
* >
* ==
* !=
* <
* <=
"""
return request.param
@pytest.fixture(params=[None, lambda x: x])
def sort_by_key(request):
"""
Simple fixture for testing keys in sorting methods.
Tests None (no key) and the identity key.
"""
return request.param
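# The pattern used below (a sketch): each TestXxx class inherits a pandas
# BaseExtensionTests suite, which consumes the fixtures defined above. A
# passing subclass means GeometryArray satisfies that slice of the extension
# array contract, e.g.:
#
#   class TestDtype(extension_tests.BaseDtypeTests):
#       pass  # inherited tests run against GeometryDtype via the fixtures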
# -----------------------------------------------------------------------------
# Inherited tests
# -----------------------------------------------------------------------------
class TestDtype(extension_tests.BaseDtypeTests):
# additional tests
def test_array_type_with_arg(self, data, dtype):
assert dtype.construct_array_type() is GeometryArray
def test_registry(self, data, dtype):
s = pd.Series(np.asarray(data), dtype=object)
result = s.astype("geometry")
assert isinstance(result.array, GeometryArray)
expected = pd.Series(data)
assert_series_equal(result, expected)
class TestInterface(extension_tests.BaseInterfaceTests):
def test_contains(self, data, data_missing):
# overridden due to the inconsistency between
# GeometryDtype.na_value = np.nan
# and None being used as NA in array
# ensure data without missing values
data = data[~data.isna()]
# first elements are non-missing
assert data[0] in data
assert data_missing[0] in data_missing
assert None in data_missing
assert None not in data
assert pd.NaT not in data_missing
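# The inconsistency in a nutshell (sketch): GeometryDtype.na_value is
# np.nan, but missing elements read back as None, i.e.
#
#   arr = from_shapely([None, Point(0, 0)])
#   arr[0] is None  # True
#
# hence the tweaks to the base test above.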
class TestConstructors(extension_tests.BaseConstructorsTests):
pass
class TestReshaping(extension_tests.BaseReshapingTests):
# NOTE: this test is copied from pandas/tests/extension/base/reshaping.py
# because starting with pandas 3.0 assert_frame_equal is strict regarding
# the exact missing value (None vs NaN)
# Our `result` uses None, but the way the `expected` is created results in
# NaNs (and specifying to use None as fill value in unstack also does not
# help)
# -> the only change compared to the upstream test is marked
@pytest.mark.parametrize(
"index",
[
# Two levels, uniform.
pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
# non-uniform
pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
# three levels, non-uniform
pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
pd.MultiIndex.from_tuples(
[
("A", "a", 1),
("A", "b", 0),
("A", "a", 0),
("B", "a", 0),
("B", "c", 1),
]
),
],
)
@pytest.mark.parametrize("obj", ["series", "frame"])
def test_unstack(self, data, index, obj):
data = data[: len(index)]
if obj == "series":
ser = pd.Series(data, index=index)
else:
ser = pd.DataFrame({"A": data, "B": data}, index=index)
n = index.nlevels
levels = list(range(n))
# [0, 1, 2]
# [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
combinations = itertools.chain.from_iterable(
itertools.permutations(levels, i) for i in range(1, n)
)
for level in combinations:
result = ser.unstack(level=level)
assert all(
isinstance(result[col].array, type(data)) for col in result.columns
)
if obj == "series":
# We should get the same result with to_frame+unstack+droplevel
df = ser.to_frame()
alt = df.unstack(level=level).droplevel(0, axis=1)
assert_frame_equal(result, alt)
obj_ser = ser.astype(object)
expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
if obj == "series":
assert (expected.dtypes == object).all()
# <------------ next line is added
expected[expected.isna()] = None
# ------------->
result = result.astype(object)
assert_frame_equal(result, expected)
class TestGetitem(extension_tests.BaseGetitemTests):
pass
class TestSetitem(extension_tests.BaseSetitemTests):
pass
class TestMissing(extension_tests.BaseMissingTests):
def test_fillna_series(self, data_missing):
fill_value = data_missing[1]
ser = pd.Series(data_missing)
# Fill with a scalar
result = ser.fillna(fill_value)
expected = pd.Series(data_missing._from_sequence([fill_value, fill_value]))
assert_series_equal(result, expected)
# Fill with a series
filler = pd.Series(
from_shapely(
[
shapely.geometry.Point(1, 1),
shapely.geometry.Point(2, 2),
],
)
)
result = ser.fillna(filler)
expected = pd.Series(data_missing._from_sequence([fill_value, fill_value]))
assert_series_equal(result, expected)
# Fill with a series not affecting the missing values
filler = pd.Series(
from_shapely(
[
shapely.geometry.Point(2, 2),
shapely.geometry.Point(1, 1),
]
),
index=[10, 11],
)
result = ser.fillna(filler)
assert_series_equal(result, ser)
# More `GeoSeries.fillna` testcases are in
# `geopandas/tests/test_pandas_methods.py::test_fillna_scalar`
# and `geopandas/tests/test_pandas_methods.py::test_fillna_series`.
@pytest.mark.skipif(
not PANDAS_GE_21, reason="fillna method not supported with older pandas"
)
def test_fillna_limit_pad(self, data_missing):
super().test_fillna_limit_pad(data_missing)
@pytest.mark.skipif(
not PANDAS_GE_21, reason="fillna method not supported with older pandas"
)
def test_fillna_limit_backfill(self, data_missing):
super().test_fillna_limit_backfill(data_missing)
@pytest.mark.skipif(
not PANDAS_GE_21, reason="fillna method not supported with older pandas"
)
def test_fillna_series_method(self, data_missing, fillna_method):
super().test_fillna_series_method(data_missing, fillna_method)
@pytest.mark.skipif(
not PANDAS_GE_21, reason="fillna method not supported with older pandas"
)
def test_fillna_no_op_returns_copy(self, data):
super().test_fillna_no_op_returns_copy(data)
if PANDAS_GE_22:
from pandas.tests.extension.base import BaseReduceTests
else:
from pandas.tests.extension.base import BaseNoReduceTests as BaseReduceTests
class TestReduce(BaseReduceTests):
@pytest.mark.skip("boolean reduce (any/all) tested in test_pandas_methods")
def test_reduce_series_boolean(self):
pass
_all_arithmetic_operators = [
"__add__",
"__radd__",
# '__sub__', '__rsub__',
"__mul__",
"__rmul__",
"__floordiv__",
"__rfloordiv__",
"__truediv__",
"__rtruediv__",
"__pow__",
"__rpow__",
"__mod__",
"__rmod__",
]
@pytest.fixture(params=_all_arithmetic_operators)
def all_arithmetic_operators(request):
"""
Fixture for dunder names for common arithmetic operations
Adapted to exclude __sub__, as this is implemented as "difference".
"""
return request.param
# an inherited test from pandas creates a Series from a list of geometries, which
# triggers a warning from Shapely that is out of GeoPandas' control, so it is ignored here
@pytest.mark.filterwarnings(
"ignore:The array interface is deprecated and will no longer work in Shapely 2.0"
)
class TestArithmeticOps(extension_tests.BaseArithmeticOpsTests):
@pytest.mark.skip(reason="not applicable")
def test_divmod_series_array(self, data, data_for_twos):
pass
@pytest.mark.skip(reason="not applicable")
def test_add_series_with_extension_array(self, data):
pass
# an inherited test from pandas creates a Series from a list of geometries, which
# triggers a warning from Shapely that is out of GeoPandas' control, so it is ignored here
@pytest.mark.filterwarnings(
"ignore:The array interface is deprecated and will no longer work in Shapely 2.0"
)
class TestComparisonOps(extension_tests.BaseComparisonOpsTests):
def _compare_other(self, s, data, op_name, other):
op = getattr(operator, op_name.strip("_"))
result = op(s, other)
expected = s.combine(other, op)
assert_series_equal(result, expected)
def test_compare_scalar(self, data, all_compare_operators):
op_name = all_compare_operators
s = pd.Series(data)
self._compare_other(s, data, op_name, data[0])
def test_compare_array(self, data, all_compare_operators):
op_name = all_compare_operators
s = pd.Series(data)
other = pd.Series([data[0]] * len(data))
self._compare_other(s, data, op_name, other)
class TestMethods(extension_tests.BaseMethodsTests):
@pytest.mark.skipif(
not PANDAS_GE_15, reason="sorting index not yet working with older pandas"
)
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna):
pass
@pytest.mark.skipif(
not PANDAS_GE_15, reason="sorting index not yet working with older pandas"
)
def test_value_counts_with_normalize(self, data):
pass
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_frame(self, data_for_sorting, ascending):
super().test_sort_values_frame(data_for_sorting, ascending)
@pytest.mark.skip(reason="searchsorted not supported")
def test_searchsorted(self, data_for_sorting, as_series):
pass
@not_yet_implemented
def test_combine_le(self):
pass
@pytest.mark.skip(reason="addition not supported")
def test_combine_add(self):
pass
@not_yet_implemented
def test_fillna_length_mismatch(self, data_missing):
msg = "Length of 'value' does not match."
with pytest.raises(ValueError, match=msg):
data_missing.fillna(data_missing.take([1]))
@no_minmax
def test_argmin_argmax(self):
pass
@no_minmax
def test_argmin_argmax_empty_array(self):
pass
@no_minmax
def test_argmin_argmax_all_na(self):
pass
@no_minmax
def test_argreduce_series(self):
pass
@no_minmax
def test_argmax_argmin_no_skipna_notimplemented(self):
pass
class TestCasting(extension_tests.BaseCastingTests):
pass
class TestGroupby(extension_tests.BaseGroupbyTests):
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_extension_agg(self, as_index, data_for_grouping):
super().test_groupby_extension_agg(as_index, data_for_grouping)
def test_groupby_extension_transform(self, data_for_grouping):
super().test_groupby_extension_transform(data_for_grouping)
@pytest.mark.parametrize(
"op",
[
lambda x: 1,
lambda x: [1] * len(x),
lambda x: pd.Series([1] * len(x)),
lambda x: x,
],
ids=["scalar", "list", "series", "object"],
)
def test_groupby_extension_apply(self, data_for_grouping, op):
super().test_groupby_extension_apply(data_for_grouping, op)
class TestPrinting(extension_tests.BasePrintingTests):
pass
@not_yet_implemented
class TestParsing(extension_tests.BaseParsingTests):
pass

View File

@@ -1,170 +0,0 @@
import pandas as pd
from shapely.geometry import Point
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import HAS_PYPROJ
from geopandas.tools import geocode, reverse_geocode
from geopandas.tools.geocoding import _prepare_geocode_result
import pytest
from geopandas.testing import assert_geodataframe_equal
from geopandas.tests.util import assert_geoseries_equal, mock
from pandas.testing import assert_series_equal
geopy = pytest.importorskip("geopy")
class ForwardMock(mock.MagicMock):
"""
Mock the forward geocoding function.
Returns the passed-in address and (p, p + 0.5), where p increases
at each call
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._n = 0.0
def __call__(self, *args, **kwargs):
self.return_value = args[0], (self._n, self._n + 0.5)
self._n += 1
return super().__call__(*args, **kwargs)
class ReverseMock(mock.MagicMock):
"""
Mock the reverse geocoding function.
Returns the passed-in point and 'address{p}', where p increases
at each call
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._n = 0
def __call__(self, *args, **kwargs):
self.return_value = "address{0}".format(self._n), args[0]
self._n += 1
return super().__call__(*args, **kwargs)
@pytest.fixture
def locations():
locations = ["260 Broadway, New York, NY", "77 Massachusetts Ave, Cambridge, MA"]
return locations
@pytest.fixture
def points():
points = [Point(-71.0597732, 42.3584308), Point(-77.0365305, 38.8977332)]
return points
def test_prepare_result():
# Calls _prepare_result with sample results from the geocoder call
# loop
p0 = Point(12.3, -45.6) # Treat these as lat/lon
p1 = Point(-23.4, 56.7)
d = {"a": ("address0", p0.coords[0]), "b": ("address1", p1.coords[0])}
df = _prepare_geocode_result(d)
assert type(df) is GeoDataFrame
if HAS_PYPROJ:
assert df.crs == "EPSG:4326"
assert len(df) == 2
assert "address" in df
coords = df.loc["a"]["geometry"].coords[0]
test = p0.coords[0]
# Output from the df should be lon/lat
assert coords[0] == pytest.approx(test[1])
assert coords[1] == pytest.approx(test[0])
coords = df.loc["b"]["geometry"].coords[0]
test = p1.coords[0]
assert coords[0] == pytest.approx(test[1])
assert coords[1] == pytest.approx(test[0])
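# Worked example of the (lat, lon) -> (lon, lat) flip checked above: a geocoder
# tuple ("address0", (42.3584, -71.0597)) -- latitude first -- becomes
# Point(-71.0597, 42.3584), since shapely points are (x, y), i.e. (lon, lat).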
def test_prepare_result_none():
p0 = Point(12.3, -45.6) # Treat these as lat/lon
d = {"a": ("address0", p0.coords[0]), "b": (None, None)}
df = _prepare_geocode_result(d)
assert type(df) is GeoDataFrame
if HAS_PYPROJ:
assert df.crs == "EPSG:4326"
assert len(df) == 2
assert "address" in df
row = df.loc["b"]
# TODO we should probably replace this with a missing value instead of point?
assert len(row["geometry"].coords) == 0
assert row["geometry"].is_empty
assert row["address"] is None
@pytest.mark.parametrize("geocode_result", (None, (None, None)))
def test_prepare_geocode_result_when_result_is(geocode_result):
result = {0: geocode_result}
expected_output = GeoDataFrame(
{"geometry": [Point()], "address": [None]},
crs="EPSG:4326",
)
output = _prepare_geocode_result(result)
assert_geodataframe_equal(output, expected_output)
def test_bad_provider_forward():
from geopy.exc import GeocoderNotFound
with pytest.raises(GeocoderNotFound):
geocode(["cambridge, ma"], "badprovider")
def test_bad_provider_reverse():
from geopy.exc import GeocoderNotFound
with pytest.raises(GeocoderNotFound):
reverse_geocode([Point(0, 0)], "badprovider")
def test_forward(locations, points):
from geopy.geocoders import Photon
for provider in ["photon", Photon]:
with mock.patch("geopy.geocoders.Photon.geocode", ForwardMock()) as m:
g = geocode(locations, provider=provider, timeout=2)
assert len(locations) == m.call_count
n = len(locations)
assert isinstance(g, GeoDataFrame)
expected = GeoSeries(
[Point(float(x) + 0.5, float(x)) for x in range(n)], crs="EPSG:4326"
)
assert_geoseries_equal(expected, g["geometry"])
assert_series_equal(g["address"], pd.Series(locations, name="address"))
def test_reverse(locations, points):
from geopy.geocoders import Photon
for provider in ["photon", Photon]:
with mock.patch("geopy.geocoders.Photon.reverse", ReverseMock()) as m:
g = reverse_geocode(points, provider=provider, timeout=2)
assert len(points) == m.call_count
assert isinstance(g, GeoDataFrame)
expected = GeoSeries(points, crs="EPSG:4326")
assert_geoseries_equal(expected, g["geometry"])
address = pd.Series(
["address" + str(x) for x in range(len(points))], name="address"
)
assert_series_equal(g["address"], address)
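# Typical un-mocked usage, as a hedged sketch (network access and a reachable
# provider are assumed, so this is illustrative only):
#
#   from geopandas.tools import geocode
#   gdf = geocode(["260 Broadway, New York, NY"], provider="photon")
#   # -> GeoDataFrame with 'geometry' (EPSG:4326 points) and 'address' columns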

Some files were not shown because too many files have changed in this diff.