2025-01-26 19:24:23 -08:00
parent 32cd60e92b
commit d1dde0dbc6
4155 changed files with 29170 additions and 216373 deletions

View File

@@ -1,8 +1,8 @@
from .clip import clip
from .geocoding import geocode, reverse_geocode
from .overlay import overlay
from .sjoin import sjoin, sjoin_nearest
from .util import collect
from .clip import clip
__all__ = [
"collect",

View File

@@ -1,6 +1,7 @@
from warnings import warn
import numpy
from shapely.geometry import MultiPoint
from geopandas.array import from_shapely, points_from_xy
@@ -64,7 +65,7 @@ def _uniform_line(geom, size, generator):
"""
fracs = generator.uniform(size=size)
return from_shapely(geom.interpolate(fracs, normalized=True)).unary_union()
return from_shapely(geom.interpolate(fracs, normalized=True)).union_all()
def _uniform_polygon(geom, size, generator):
@@ -80,4 +81,4 @@ def _uniform_polygon(geom, size, generator):
)
valid_samples = batch[batch.sindex.query(geom, predicate="contains")]
candidates.extend(valid_samples)
return GeoSeries(candidates[:size]).unary_union
return GeoSeries(candidates[:size]).union_all()
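Both hunks swap the deprecated `unary_union` property for the `union_all()` method. A minimal sketch of the replacement on a plain GeoSeries (assuming a GeoPandas version where `GeoSeries.union_all` is available):

```python
import geopandas
from shapely.geometry import Point

# collapse several geometries into a single (Multi)geometry
gs = geopandas.GeoSeries([Point(0, 0), Point(1, 1), Point(2, 2)])
merged = gs.union_all()      # method call; replaces the deprecated gs.unary_union property
print(merged.geom_type)      # "MultiPoint"
```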

View File

@@ -58,33 +58,27 @@ def _get_C_info():
geos_dir = None
try:
import fiona
import pyogrio
gdal_version = fiona.env.get_gdal_release_name()
gdal_version = pyogrio.__gdal_version_string__
gdal_dir = pyogrio.get_gdal_data_path()
except Exception:
gdal_version = None
try:
import fiona
gdal_dir = fiona.env.GDALDataFinder().search()
except Exception:
gdal_dir = None
if gdal_version is None:
try:
import pyogrio
import fiona
gdal_version = pyogrio.__gdal_version_string__
gdal_dir = None
gdal_version = fiona.env.get_gdal_release_name()
except Exception:
pass
gdal_version = None
try:
# get_gdal_data_path is only available in pyogrio >= 0.4.2
from pyogrio import get_gdal_data_path
import fiona
gdal_dir = get_gdal_data_path()
gdal_dir = fiona.env.GDALDataFinder().search()
except Exception:
pass
gdal_dir = None
blob = [
("GEOS", geos_version),
@@ -114,16 +108,15 @@ def _get_deps_info():
"pyproj",
"shapely",
# optional deps
"fiona",
"pyogrio",
"geoalchemy2",
"geopy",
"matplotlib",
"mapclassify",
"pygeos",
"pyogrio",
"fiona",
"psycopg",
"psycopg2",
"pyarrow",
"rtree",
]
def get_version(module):
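The reordered blocks above make pyogrio the preferred source of GDAL information, with fiona only as a fallback. A condensed sketch of that lookup order (both imports are optional dependencies and may be missing):

```python
def _gdal_info():
    """Return (gdal_version, gdal_data_dir), preferring pyogrio over fiona."""
    gdal_version = gdal_dir = None
    try:
        import pyogrio
        gdal_version = pyogrio.__gdal_version_string__
        gdal_dir = pyogrio.get_gdal_data_path()
    except Exception:
        pass
    if gdal_version is None:
        try:
            import fiona
            gdal_version = fiona.env.get_gdal_release_name()
            gdal_dir = fiona.env.GDALDataFinder().search()
        except Exception:
            pass
    return gdal_version, gdal_dir
```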

View File

@@ -5,11 +5,13 @@ geopandas.clip
A module to clip vector data using GeoPandas.
"""
import warnings
import numpy as np
import pandas.api.types
from shapely.geometry import Polygon, MultiPolygon, box
from shapely.geometry import MultiPolygon, Polygon, box
from geopandas import GeoDataFrame, GeoSeries
from geopandas.array import _check_crs, _crs_mismatch_warn
@@ -21,7 +23,7 @@ def _mask_is_list_like_rectangle(mask):
)
def _clip_gdf_with_mask(gdf, mask):
def _clip_gdf_with_mask(gdf, mask, sort=False):
"""Clip geometry to the polygon/rectangle extent.
Clip an input GeoDataFrame to the polygon extent of the polygon
@@ -35,6 +37,10 @@ def _clip_gdf_with_mask(gdf, mask):
mask : (Multi)Polygon, list-like
Reference polygon/rectangle for clipping.
sort : boolean, default False
If True, the results will be sorted in ascending order using the
geometries' indexes as the primary key.
Returns
-------
GeoDataFrame
@@ -47,7 +53,9 @@ def _clip_gdf_with_mask(gdf, mask):
else:
intersection_polygon = mask
gdf_sub = gdf.iloc[gdf.sindex.query(intersection_polygon, predicate="intersects")]
gdf_sub = gdf.iloc[
gdf.sindex.query(intersection_polygon, predicate="intersects", sort=sort)
]
# For performance reasons points don't need to be intersected with poly
non_point_mask = gdf_sub.geom_type != "Point"
@@ -60,13 +68,13 @@ def _clip_gdf_with_mask(gdf, mask):
if isinstance(gdf_sub, GeoDataFrame):
clipped = gdf_sub.copy()
if clipping_by_rectangle:
clipped.loc[
non_point_mask, clipped._geometry_column_name
] = gdf_sub.geometry.values[non_point_mask].clip_by_rect(*mask)
clipped.loc[non_point_mask, clipped._geometry_column_name] = (
gdf_sub.geometry.values[non_point_mask].clip_by_rect(*mask)
)
else:
clipped.loc[
non_point_mask, clipped._geometry_column_name
] = gdf_sub.geometry.values[non_point_mask].intersection(mask)
clipped.loc[non_point_mask, clipped._geometry_column_name] = (
gdf_sub.geometry.values[non_point_mask].intersection(mask)
)
else:
# GeoSeries
clipped = gdf_sub.copy()
@@ -81,7 +89,7 @@ def _clip_gdf_with_mask(gdf, mask):
return clipped
def clip(gdf, mask, keep_geom_type=False):
def clip(gdf, mask, keep_geom_type=False, sort=False):
"""Clip points, lines, or polygon geometries to the mask extent.
Both layers must be in the same Coordinate Reference System (CRS).
@@ -112,6 +120,9 @@ def clip(gdf, mask, keep_geom_type=False):
If True, return only geometries of original type in case of intersection
resulting in multiple geometry types or GeometryCollections.
If False, return all resulting geometries (potentially mixed-types).
sort : boolean, default False
If True, the results will be sorted in ascending order using the
geometries' indexes as the primary key.
Returns
-------
@@ -185,11 +196,11 @@ def clip(gdf, mask, keep_geom_type=False):
return gdf.iloc[:0]
if isinstance(mask, (GeoDataFrame, GeoSeries)):
combined_mask = mask.geometry.unary_union
combined_mask = mask.geometry.union_all()
else:
combined_mask = mask
clipped = _clip_gdf_with_mask(gdf, combined_mask)
clipped = _clip_gdf_with_mask(gdf, combined_mask, sort=sort)
if keep_geom_type:
geomcoll_concat = (clipped.geom_type == "GeometryCollection").any()
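The new `sort` keyword is forwarded to the spatial-index query, so callers can ask for index-ordered output instead of spatial-index order. A small usage sketch (the coordinates and frame are invented for illustration):

```python
import geopandas
from shapely.geometry import Point, box

# points deliberately stored out of index order relative to their positions
pts = geopandas.GeoDataFrame(geometry=[Point(3, 3), Point(1, 1), Point(2, 2)])
mask = box(0, 0, 4, 4)

clipped = geopandas.clip(pts, mask)                     # order as returned by the spatial index
clipped_sorted = geopandas.clip(pts, mask, sort=True)   # rows sorted by the original index
print(clipped_sorted.index.tolist())                    # [0, 1, 2]
```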

View File

@@ -1,5 +1,5 @@
from collections import defaultdict
import time
from collections import defaultdict
import pandas as pd
@@ -120,8 +120,8 @@ def reverse_geocode(points, provider=None, **kwargs):
def _query(data, forward, provider, throttle_time, **kwargs):
# generic wrapper for calls over lists to geopy Geocoders
from geopy.geocoders.base import GeocoderQueryError
from geopy.geocoders import get_geocoder_for_service
from geopy.geocoders.base import GeocoderQueryError
if forward:
if not isinstance(data, pd.Series):

View File

@@ -4,8 +4,8 @@ from functools import reduce
import numpy as np
import pandas as pd
from geopandas import _compat as compat
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import PANDAS_GE_30
from geopandas.array import _check_crs, _crs_mismatch_warn
@@ -15,12 +15,15 @@ def _ensure_geometry_column(df):
If another column with that name exists, it will be dropped.
"""
if not df._geometry_column_name == "geometry":
if "geometry" in df.columns:
df.drop("geometry", axis=1, inplace=True)
df.rename(
columns={df._geometry_column_name: "geometry"}, copy=False, inplace=True
)
df.set_geometry("geometry", inplace=True)
if PANDAS_GE_30:
if "geometry" in df.columns:
df = df.drop("geometry", axis=1)
df = df.rename_geometry("geometry")
else:
if "geometry" in df.columns:
df.drop("geometry", axis=1, inplace=True)
df.rename_geometry("geometry", inplace=True)
return df
def _overlay_intersection(df1, df2):
@@ -37,7 +40,7 @@ def _overlay_intersection(df1, df2):
right.reset_index(drop=True, inplace=True)
intersections = left.intersection(right)
poly_ix = intersections.geom_type.isin(["Polygon", "MultiPolygon"])
intersections.loc[poly_ix] = intersections[poly_ix].buffer(0)
intersections.loc[poly_ix] = intersections[poly_ix].make_valid()
# only keep actual intersecting geometries
pairs_intersect = pd.DataFrame({"__idx1": idx1, "__idx2": idx2})
@@ -66,8 +69,8 @@ def _overlay_intersection(df1, df2):
right_index=True,
suffixes=("_1", "_2"),
)
result["__idx1"] = None
result["__idx2"] = None
result["__idx1"] = np.nan
result["__idx2"] = np.nan
return result[
result.columns.drop(df1.geometry.name).tolist() + [df1.geometry.name]
]
@@ -94,10 +97,7 @@ def _overlay_difference(df1, df2):
new_g.append(new)
differences = GeoSeries(new_g, index=df1.index, crs=df1.crs)
poly_ix = differences.geom_type.isin(["Polygon", "MultiPolygon"])
if compat.USE_PYGEOS:
differences.loc[poly_ix] = differences[poly_ix].make_valid()
else:
differences.loc[poly_ix] = differences[poly_ix].buffer(0)
differences.loc[poly_ix] = differences[poly_ix].make_valid()
geom_diff = differences[~differences.is_empty].copy()
dfdiff = df1[~differences.is_empty].copy()
dfdiff[dfdiff._geometry_column_name] = geom_diff
@@ -115,8 +115,8 @@ def _overlay_symmetric_diff(df1, df2):
dfdiff1["__idx2"] = np.nan
dfdiff2["__idx1"] = np.nan
# ensure geometry name (otherwise merge goes wrong)
_ensure_geometry_column(dfdiff1)
_ensure_geometry_column(dfdiff2)
dfdiff1 = _ensure_geometry_column(dfdiff1)
dfdiff2 = _ensure_geometry_column(dfdiff2)
# combine both 'difference' dataframes
dfsym = dfdiff1.merge(
dfdiff2, on=["__idx1", "__idx2"], how="outer", suffixes=("_1", "_2")
@@ -170,7 +170,7 @@ def overlay(df1, df2, how="intersection", keep_geom_type=None, make_valid=True):
which will set keep_geom_type to True but warn upon dropping
geometries.
make_valid : bool, default True
If True, any invalid input geometries are corrected with a call to `buffer(0)`,
If True, any invalid input geometries are corrected with a call to make_valid(),
if False, a `ValueError` is raised if any input geometries are invalid.
Returns
@@ -190,40 +190,40 @@ def overlay(df1, df2, how="intersection", keep_geom_type=None, make_valid=True):
>>> df2 = geopandas.GeoDataFrame({'geometry': polys2, 'df2_data':[1,2]})
>>> geopandas.overlay(df1, df2, how='union')
df1_data df2_data geometry
0 1.0 1.0 POLYGON ((2.00000 2.00000, 2.00000 1.00000, 1....
1 2.0 1.0 POLYGON ((2.00000 2.00000, 2.00000 3.00000, 3....
2 2.0 2.0 POLYGON ((4.00000 4.00000, 4.00000 3.00000, 3....
3 1.0 NaN POLYGON ((2.00000 0.00000, 0.00000 0.00000, 0....
4 2.0 NaN MULTIPOLYGON (((3.00000 4.00000, 3.00000 3.000...
5 NaN 1.0 MULTIPOLYGON (((2.00000 3.00000, 2.00000 2.000...
6 NaN 2.0 POLYGON ((3.00000 5.00000, 5.00000 5.00000, 5....
df1_data df2_data geometry
0 1.0 1.0 POLYGON ((2 2, 2 1, 1 1, 1 2, 2 2))
1 2.0 1.0 POLYGON ((2 2, 2 3, 3 3, 3 2, 2 2))
2 2.0 2.0 POLYGON ((4 4, 4 3, 3 3, 3 4, 4 4))
3 1.0 NaN POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))
4 2.0 NaN MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...
5 NaN 1.0 MULTIPOLYGON (((2 3, 2 2, 1 2, 1 3, 2 3)), ((3...
6 NaN 2.0 POLYGON ((3 5, 5 5, 5 3, 4 3, 4 4, 3 4, 3 5))
>>> geopandas.overlay(df1, df2, how='intersection')
df1_data df2_data geometry
0 1 1 POLYGON ((2.00000 2.00000, 2.00000 1.00000, 1....
1 2 1 POLYGON ((2.00000 2.00000, 2.00000 3.00000, 3....
2 2 2 POLYGON ((4.00000 4.00000, 4.00000 3.00000, 3....
df1_data df2_data geometry
0 1 1 POLYGON ((2 2, 2 1, 1 1, 1 2, 2 2))
1 2 1 POLYGON ((2 2, 2 3, 3 3, 3 2, 2 2))
2 2 2 POLYGON ((4 4, 4 3, 3 3, 3 4, 4 4))
>>> geopandas.overlay(df1, df2, how='symmetric_difference')
df1_data df2_data geometry
0 1.0 NaN POLYGON ((2.00000 0.00000, 0.00000 0.00000, 0....
1 2.0 NaN MULTIPOLYGON (((3.00000 4.00000, 3.00000 3.000...
2 NaN 1.0 MULTIPOLYGON (((2.00000 3.00000, 2.00000 2.000...
3 NaN 2.0 POLYGON ((3.00000 5.00000, 5.00000 5.00000, 5....
df1_data df2_data geometry
0 1.0 NaN POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))
1 2.0 NaN MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...
2 NaN 1.0 MULTIPOLYGON (((2 3, 2 2, 1 2, 1 3, 2 3)), ((3...
3 NaN 2.0 POLYGON ((3 5, 5 5, 5 3, 4 3, 4 4, 3 4, 3 5))
>>> geopandas.overlay(df1, df2, how='difference')
geometry df1_data
0 POLYGON ((2.00000 0.00000, 0.00000 0.00000, 0.... 1
1 MULTIPOLYGON (((3.00000 4.00000, 3.00000 3.000... 2
geometry df1_data
0 POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0)) 1
1 MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4... 2
>>> geopandas.overlay(df1, df2, how='identity')
df1_data df2_data geometry
0 1.0 1.0 POLYGON ((2.00000 2.00000, 2.00000 1.00000, 1....
1 2.0 1.0 POLYGON ((2.00000 2.00000, 2.00000 3.00000, 3....
2 2.0 2.0 POLYGON ((4.00000 4.00000, 4.00000 3.00000, 3....
3 1.0 NaN POLYGON ((2.00000 0.00000, 0.00000 0.00000, 0....
4 2.0 NaN MULTIPOLYGON (((3.00000 4.00000, 3.00000 3.000...
0 1.0 1.0 POLYGON ((2 2, 2 1, 1 1, 1 2, 2 2))
1 2.0 1.0 POLYGON ((2 2, 2 3, 3 3, 3 2, 2 2))
2 2.0 2.0 POLYGON ((4 4, 4 3, 3 3, 3 4, 4 4))
3 1.0 NaN POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))
4 2.0 NaN MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...
See also
--------
@@ -300,7 +300,7 @@ def overlay(df1, df2, how="intersection", keep_geom_type=None, make_valid=True):
mask = ~df.geometry.is_valid
col = df._geometry_column_name
if make_valid:
df.loc[mask, col] = df.loc[mask, col].buffer(0)
df.loc[mask, col] = df.loc[mask, col].make_valid()
elif mask.any():
raise ValueError(
"You have passed make_valid=False along with "
@@ -362,7 +362,7 @@ def overlay(df1, df2, how="intersection", keep_geom_type=None, make_valid=True):
# level_0 created with above reset_index operation
# and represents the original geometry collections
# TODO avoiding dissolve to call unary_union in this case could further
# TODO avoiding dissolve to call union_all in this case could further
# improve performance (we only need to collect geometries in their
# respective Multi version)
dissolved = exploded.dissolve(by="level_0")
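Several spots above replace the old `buffer(0)` repair trick with the explicit `make_valid()` method. A standalone sketch of what that repair does (assuming Shapely 2.x, where `GeoSeries.make_valid` is available):

```python
import geopandas
from shapely.geometry import Polygon

# a self-intersecting "bowtie" polygon is invalid
bowtie = Polygon([(0, 0), (2, 2), (2, 0), (0, 2)])
gs = geopandas.GeoSeries([bowtie])

print(bool(gs.is_valid[0]))          # False
repaired = gs.make_valid()           # replaces the older gs.buffer(0) workaround
print(bool(repaired.is_valid[0]))    # True
```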

View File

@@ -1,11 +1,12 @@
from typing import Optional
import warnings
from functools import partial
from typing import Optional
import numpy as np
import pandas as pd
from geopandas import GeoDataFrame
from geopandas import _compat as compat
from geopandas._compat import PANDAS_GE_30
from geopandas.array import _check_crs, _crs_mismatch_warn
@@ -16,6 +17,8 @@ def sjoin(
predicate="intersects",
lsuffix="left",
rsuffix="right",
distance=None,
on_attribute=None,
**kwargs,
):
"""Spatial join of two GeoDataFrames.
@@ -43,6 +46,16 @@ def sjoin(
Suffix to apply to overlapping column names (left GeoDataFrame).
rsuffix : string, default 'right'
Suffix to apply to overlapping column names (right GeoDataFrame).
distance : number or array_like, optional
Distance(s) around each input geometry within which to query the tree
for the 'dwithin' predicate. If array_like, must be
one-dimensional with length equal to the length of the left GeoDataFrame.
Required if ``predicate='dwithin'``.
on_attribute : string, list or tuple
Column name(s) to join on as an additional join restriction on top
of the spatial predicate. These must be found in both DataFrames.
If set, observations are joined only if the predicate applies
and values in specified columns match.
Examples
--------
@@ -74,12 +87,12 @@ def sjoin(
>>> groceries_w_communities = geopandas.sjoin(groceries, chicago)
>>> groceries_w_communities.head() # doctest: +SKIP
OBJECTID Ycoord Xcoord ... GonorrF GonorrM Tuberc
0 16 41.973266 -87.657073 ... 170.8 468.7 13.6
87 365 41.961707 -87.654058 ... 170.8 468.7 13.6
90 373 41.963131 -87.656352 ... 170.8 468.7 13.6
140 582 41.969131 -87.674882 ... 170.8 468.7 13.6
1 18 41.696367 -87.681315 ... 800.5 741.1 2.6
OBJECTID community geometry
0 16 UPTOWN MULTIPOINT ((-87.65661 41.97321))
1 18 MORGAN PARK MULTIPOINT ((-87.68136 41.69713))
2 22 NEAR WEST SIDE MULTIPOINT ((-87.63918 41.86847))
3 23 NEAR WEST SIDE MULTIPOINT ((-87.65495 41.87783))
4 27 CHATHAM MULTIPOINT ((-87.62715 41.73623))
[5 rows x 95 columns]
See also
@@ -92,40 +105,42 @@ def sjoin(
Every operation in GeoPandas is planar, i.e. the potential third
dimension is not taken into account.
"""
if "op" in kwargs:
op = kwargs.pop("op")
deprecation_message = (
"The `op` parameter is deprecated and will be removed"
" in a future release. Please use the `predicate` parameter"
" instead."
)
if predicate != "intersects" and op != predicate:
override_message = (
"A non-default value for `predicate` was passed"
f' (got `predicate="{predicate}"`'
f' in combination with `op="{op}"`).'
" The value of `predicate` will be overridden by the value of `op`,"
" , which may result in unexpected behavior."
f"\n{deprecation_message}"
)
warnings.warn(override_message, UserWarning, stacklevel=4)
else:
warnings.warn(deprecation_message, FutureWarning, stacklevel=4)
predicate = op
if kwargs:
first = next(iter(kwargs.keys()))
raise TypeError(f"sjoin() got an unexpected keyword argument '{first}'")
_basic_checks(left_df, right_df, how, lsuffix, rsuffix)
on_attribute = _maybe_make_list(on_attribute)
indices = _geom_predicate_query(left_df, right_df, predicate)
_basic_checks(left_df, right_df, how, lsuffix, rsuffix, on_attribute=on_attribute)
joined = _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)
indices = _geom_predicate_query(
left_df, right_df, predicate, distance, on_attribute=on_attribute
)
joined, _ = _frame_join(
left_df,
right_df,
indices,
None,
how,
lsuffix,
rsuffix,
predicate,
on_attribute=on_attribute,
)
return joined
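For context, a hedged usage sketch of the two new `sjoin` keywords added here: `distance` (required for the `'dwithin'` predicate, which itself needs GEOS >= 3.10) and `on_attribute`. The frames and column names are invented for illustration:

```python
import geopandas
from shapely.geometry import Point

left = geopandas.GeoDataFrame(
    {"kind": ["a", "b"], "geometry": [Point(0, 0), Point(5, 5)]}
)
right = geopandas.GeoDataFrame(
    {"kind": ["a"], "geometry": [Point(0.5, 0.5)]}
)

# keep pairs whose geometries lie within 1 unit of each other
near = geopandas.sjoin(left, right, predicate="dwithin", distance=1.0)

# additionally require the non-spatial column "kind" to match on both sides
near_same_kind = geopandas.sjoin(
    left, right, predicate="dwithin", distance=1.0, on_attribute="kind"
)
```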
def _basic_checks(left_df, right_df, how, lsuffix, rsuffix):
def _maybe_make_list(obj):
if isinstance(obj, tuple):
return list(obj)
if obj is not None and not isinstance(obj, list):
return [obj]
return obj
def _basic_checks(left_df, right_df, how, lsuffix, rsuffix, on_attribute=None):
"""Checks the validity of join input parameters.
`how` must be one of the valid options.
@@ -142,6 +157,8 @@ def _basic_checks(left_df, right_df, how, lsuffix, rsuffix):
left index suffix
rsuffix : str
right index suffix
on_attribute : list, default None
list of column names to merge on along with geometry
"""
if not isinstance(left_df, GeoDataFrame):
raise ValueError(
@@ -162,20 +179,28 @@ def _basic_checks(left_df, right_df, how, lsuffix, rsuffix):
if not _check_crs(left_df, right_df):
_crs_mismatch_warn(left_df, right_df, stacklevel=4)
index_left = "index_{}".format(lsuffix)
index_right = "index_{}".format(rsuffix)
# due to GH 352
if any(left_df.columns.isin([index_left, index_right])) or any(
right_df.columns.isin([index_left, index_right])
):
raise ValueError(
"'{0}' and '{1}' cannot be names in the frames being"
" joined".format(index_left, index_right)
)
if on_attribute:
for attr in on_attribute:
if (attr not in left_df) and (attr not in right_df):
raise ValueError(
f"Expected column {attr} is missing from both of the dataframes."
)
if attr not in left_df:
raise ValueError(
f"Expected column {attr} is missing from the left dataframe."
)
if attr not in right_df:
raise ValueError(
f"Expected column {attr} is missing from the right dataframe."
)
if attr in (left_df.geometry.name, right_df.geometry.name):
raise ValueError(
"Active geometry column cannot be used as an input "
"for on_attribute parameter."
)
def _geom_predicate_query(left_df, right_df, predicate):
def _geom_predicate_query(left_df, right_df, predicate, distance, on_attribute=None):
"""Compute geometric comparisons and get matching indices.
Parameters
@@ -184,6 +209,9 @@ def _geom_predicate_query(left_df, right_df, predicate):
right_df : GeoDataFrame
predicate : string
Binary predicate to query.
on_attribute: list, default None
list of column names to merge on along with geometry
Returns
-------
@@ -191,163 +219,292 @@ def _geom_predicate_query(left_df, right_df, predicate):
DataFrame with matching indices in
columns named `_key_left` and `_key_right`.
"""
with warnings.catch_warnings():
# We don't need to show our own warning here
# TODO remove this once the deprecation has been enforced
warnings.filterwarnings(
"ignore", "Generated spatial index is empty", FutureWarning
)
original_predicate = predicate
original_predicate = predicate
if predicate == "within":
# within is implemented as the inverse of contains
# contains is a faster predicate
# see discussion at https://github.com/geopandas/geopandas/pull/1421
predicate = "contains"
sindex = left_df.sindex
input_geoms = right_df.geometry
else:
# all other predicates are symmetric
# keep them the same
sindex = right_df.sindex
input_geoms = left_df.geometry
if predicate == "within":
# within is implemented as the inverse of contains
# contains is a faster predicate
# see discussion at https://github.com/geopandas/geopandas/pull/1421
predicate = "contains"
sindex = left_df.sindex
input_geoms = right_df.geometry
else:
# all other predicates are symmetric
# keep them the same
sindex = right_df.sindex
input_geoms = left_df.geometry
if sindex:
l_idx, r_idx = sindex.query(input_geoms, predicate=predicate, sort=False)
indices = pd.DataFrame({"_key_left": l_idx, "_key_right": r_idx})
l_idx, r_idx = sindex.query(
input_geoms, predicate=predicate, sort=False, distance=distance
)
else:
# when sindex is empty / has no valid geometries
indices = pd.DataFrame(columns=["_key_left", "_key_right"], dtype=float)
l_idx, r_idx = np.array([], dtype=np.intp), np.array([], dtype=np.intp)
if original_predicate == "within":
# within is implemented as the inverse of contains
# flip back the results
indices = indices.rename(
columns={"_key_left": "_key_right", "_key_right": "_key_left"}
r_idx, l_idx = l_idx, r_idx
indexer = np.lexsort((r_idx, l_idx))
l_idx = l_idx[indexer]
r_idx = r_idx[indexer]
if on_attribute:
for attr in on_attribute:
(l_idx, r_idx), _ = _filter_shared_attribute(
left_df, right_df, l_idx, r_idx, attr
)
return l_idx, r_idx
def _reset_index_with_suffix(df, suffix, other):
"""
Equivalent of df.reset_index(), but with adding 'suffix' to auto-generated
column names.
"""
index_original = df.index.names
if PANDAS_GE_30:
df_reset = df.reset_index()
else:
# we already made a copy of the dataframe in _frame_join before getting here
df_reset = df
df_reset.reset_index(inplace=True)
column_names = df_reset.columns.to_numpy(copy=True)
for i, label in enumerate(index_original):
# if the original label was None, add suffix to auto-generated name
if label is None:
new_label = column_names[i]
if "level" in new_label:
# reset_index of MultiIndex gives "level_i" names, preserve the "i"
lev = new_label.split("_")[1]
new_label = f"index_{suffix}{lev}"
else:
new_label = f"index_{suffix}"
# check new label will not be in other dataframe
if new_label in df.columns or new_label in other.columns:
raise ValueError(
"'{0}' cannot be a column name in the frames being"
" joined".format(new_label)
)
column_names[i] = new_label
return df_reset, pd.Index(column_names)
def _process_column_names_with_suffix(
left: pd.Index, right: pd.Index, suffixes, left_df, right_df
):
"""
Add suffixes to overlapping labels (ignoring the geometry column).
This is based on pandas' merge logic at https://github.com/pandas-dev/pandas/blob/
a0779adb183345a8eb4be58b3ad00c223da58768/pandas/core/reshape/merge.py#L2300-L2370
"""
to_rename = left.intersection(right)
if len(to_rename) == 0:
return left, right
lsuffix, rsuffix = suffixes
if not lsuffix and not rsuffix:
raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
def renamer(x, suffix, geometry):
if x in to_rename and x != geometry and suffix is not None:
return f"{x}_{suffix}"
return x
lrenamer = partial(
renamer,
suffix=lsuffix,
geometry=getattr(left_df, "_geometry_column_name", None),
)
rrenamer = partial(
renamer,
suffix=rsuffix,
geometry=getattr(right_df, "_geometry_column_name", None),
)
# TODO retain index name?
left_renamed = pd.Index([lrenamer(lab) for lab in left])
right_renamed = pd.Index([rrenamer(lab) for lab in right])
dups = []
if not left_renamed.is_unique:
# Only warn when duplicates are caused because of suffixes, already duplicated
# columns in origin should not warn
dups = left_renamed[(left_renamed.duplicated()) & (~left.duplicated())].tolist()
if not right_renamed.is_unique:
dups.extend(
right_renamed[(right_renamed.duplicated()) & (~right.duplicated())].tolist()
)
# TODO turn this into an error (pandas has done so as well)
if dups:
warnings.warn(
f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
f"result is deprecated and will raise a MergeError in a future version.",
FutureWarning,
stacklevel=4,
)
return indices
return left_renamed, right_renamed
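A toy illustration of the suffixing rule implemented above: labels present in both frames get the left/right suffix, while the geometry column is left alone (column names here are hypothetical):

```python
import pandas as pd

left_cols = pd.Index(["id", "value", "geometry"])
right_cols = pd.Index(["id", "score", "geometry"])
overlap = left_cols.intersection(right_cols)   # contains "id" and "geometry"

def rename(label, suffix, geometry="geometry"):
    return f"{label}_{suffix}" if label in overlap and label != geometry else label

print([rename(c, "left") for c in left_cols])    # ['id_left', 'value', 'geometry']
print([rename(c, "right") for c in right_cols])  # ['id_right', 'score', 'geometry']
```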
def _frame_join(join_df, left_df, right_df, how, lsuffix, rsuffix):
def _restore_index(joined, index_names, index_names_original):
"""
Set back the original index columns, and restore their name as `None`
if they didn't have a name originally.
"""
if PANDAS_GE_30:
joined = joined.set_index(list(index_names))
else:
joined.set_index(list(index_names), inplace=True)
# restore the fact that the index didn't have a name
joined_index_names = list(joined.index.names)
for i, label in enumerate(index_names_original):
if label is None:
joined_index_names[i] = None
joined.index.names = joined_index_names
return joined
def _adjust_indexers(indices, distances, original_length, how, predicate):
"""
The left/right indexers from the query represents an inner join.
For a left or right join, we need to adjust them to include the rows
that would not be present in an inner join.
"""
# the indices represent an inner join, no adjustment needed
if how == "inner":
return indices, distances
l_idx, r_idx = indices
if how == "right":
# re-sort so it is sorted by the right indexer
indexer = np.lexsort((l_idx, r_idx))
l_idx, r_idx = l_idx[indexer], r_idx[indexer]
if distances is not None:
distances = distances[indexer]
# switch order
r_idx, l_idx = l_idx, r_idx
# determine which indices are missing and where they would need to be inserted
idx = np.arange(original_length)
l_idx_missing = idx[~np.isin(idx, l_idx)]
insert_idx = np.searchsorted(l_idx, l_idx_missing)
# for the left indexer, insert those missing indices
l_idx = np.insert(l_idx, insert_idx, l_idx_missing)
# for the right indexer, insert -1 -> to get missing values in pandas' reindexing
r_idx = np.insert(r_idx, insert_idx, -1)
# for the indices, already insert those missing values manually
if distances is not None:
distances = np.insert(distances, insert_idx, np.nan)
if how == "right":
# switch back
l_idx, r_idx = r_idx, l_idx
return (l_idx, r_idx), distances
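The -1 insertion that turns an inner-join indexer into a left-join indexer can be seen in isolation with plain NumPy (a standalone sketch, not the library function itself):

```python
import numpy as np

# inner-join result: left rows 0 and 2 matched right rows 5 and 7; left row 1 had no match
l_idx = np.array([0, 2])
r_idx = np.array([5, 7])
original_length = 3   # number of rows in the left frame

idx = np.arange(original_length)
missing = idx[~np.isin(idx, l_idx)]            # array([1])
insert_at = np.searchsorted(l_idx, missing)    # array([1])

l_idx = np.insert(l_idx, insert_at, missing)   # array([0, 1, 2])
r_idx = np.insert(r_idx, insert_at, -1)        # array([ 5, -1,  7]); -1 later becomes NaN
```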
def _frame_join(
left_df,
right_df,
indices,
distances,
how,
lsuffix,
rsuffix,
predicate,
on_attribute=None,
):
"""Join the GeoDataFrames at the DataFrame level.
Parameters
----------
join_df : DataFrame
Indices and join data returned by the geometric join.
Must have columns `_key_left` and `_key_right`
with integer indices representing the matches
from `left_df` and `right_df` respectively.
Additional columns may be included and will be copied to
the resultant GeoDataFrame.
left_df : GeoDataFrame
right_df : GeoDataFrame
indices : tuple of ndarray
Indices returned by the geometric join. Tuple with integer
indices representing the matches from `left_df` and `right_df`
respectively.
distances : ndarray, optional
Passed through and adapted based on the indices, if needed.
how : string
The type of join to use on the DataFrame level.
lsuffix : string
Suffix to apply to overlapping column names (left GeoDataFrame).
rsuffix : string
Suffix to apply to overlapping column names (right GeoDataFrame).
how : string
The type of join to use on the DataFrame level.
on_attribute: list, default None
list of column names to merge on along with geometry
Returns
-------
GeoDataFrame
Joined GeoDataFrame.
"""
# the spatial index only allows limited (numeric) index types, but an
# index in geopandas may be any arbitrary dtype. so reset both indices now
# and store references to the original indices, to be reaffixed later.
# GH 352
index_left = "index_{}".format(lsuffix)
left_df = left_df.copy(deep=True)
try:
left_index_name = left_df.index.name
left_df.index = left_df.index.rename(index_left)
except TypeError:
index_left = [
"index_{}".format(lsuffix + str(pos))
for pos, ix in enumerate(left_df.index.names)
]
left_index_name = left_df.index.names
left_df.index = left_df.index.rename(index_left)
left_df = left_df.reset_index()
if on_attribute: # avoid renaming or duplicating shared column
right_df = right_df.drop(on_attribute, axis=1)
index_right = "index_{}".format(rsuffix)
right_df = right_df.copy(deep=True)
try:
right_index_name = right_df.index.name
right_df.index = right_df.index.rename(index_right)
except TypeError:
index_right = [
"index_{}".format(rsuffix + str(pos))
for pos, ix in enumerate(right_df.index.names)
]
right_index_name = right_df.index.names
right_df.index = right_df.index.rename(index_right)
right_df = right_df.reset_index()
if how in ("inner", "left"):
right_df = right_df.drop(right_df.geometry.name, axis=1)
else: # how == 'right':
left_df = left_df.drop(left_df.geometry.name, axis=1)
left_df = left_df.copy(deep=False)
left_nlevels = left_df.index.nlevels
left_index_original = left_df.index.names
left_df, left_column_names = _reset_index_with_suffix(left_df, lsuffix, right_df)
right_df = right_df.copy(deep=False)
right_nlevels = right_df.index.nlevels
right_index_original = right_df.index.names
right_df, right_column_names = _reset_index_with_suffix(right_df, rsuffix, left_df)
# if conflicting names in left and right, add suffix
left_column_names, right_column_names = _process_column_names_with_suffix(
left_column_names,
right_column_names,
(lsuffix, rsuffix),
left_df,
right_df,
)
left_df.columns = left_column_names
right_df.columns = right_column_names
left_index = left_df.columns[:left_nlevels]
right_index = right_df.columns[:right_nlevels]
# perform join on the dataframes
if how == "inner":
join_df = join_df.set_index("_key_left")
joined = (
left_df.merge(join_df, left_index=True, right_index=True)
.merge(
right_df.drop(right_df.geometry.name, axis=1),
left_on="_key_right",
right_index=True,
suffixes=("_{}".format(lsuffix), "_{}".format(rsuffix)),
)
.set_index(index_left)
.drop(["_key_right"], axis=1)
)
if isinstance(index_left, list):
joined.index.names = left_index_name
else:
joined.index.name = left_index_name
elif how == "left":
join_df = join_df.set_index("_key_left")
joined = (
left_df.merge(join_df, left_index=True, right_index=True, how="left")
.merge(
right_df.drop(right_df.geometry.name, axis=1),
how="left",
left_on="_key_right",
right_index=True,
suffixes=("_{}".format(lsuffix), "_{}".format(rsuffix)),
)
.set_index(index_left)
.drop(["_key_right"], axis=1)
)
if isinstance(index_left, list):
joined.index.names = left_index_name
else:
joined.index.name = left_index_name
original_length = len(right_df) if how == "right" else len(left_df)
(l_idx, r_idx), distances = _adjust_indexers(
indices, distances, original_length, how, predicate
)
# the `take` method doesn't allow introducing NaNs with -1 indices
# left = left_df.take(l_idx)
# therefore we are using the private _reindex_with_indexers as workaround
new_index = pd.RangeIndex(len(l_idx))
left = left_df._reindex_with_indexers({0: (new_index, l_idx)})
right = right_df._reindex_with_indexers({0: (new_index, r_idx)})
if PANDAS_GE_30:
kwargs = {}
else:
kwargs = dict(copy=False)
joined = pd.concat([left, right], axis=1, **kwargs)
if how in ("inner", "left"):
joined = _restore_index(joined, left_index, left_index_original)
else: # how == 'right':
joined = (
left_df.drop(left_df.geometry.name, axis=1)
.merge(
join_df.merge(
right_df, left_on="_key_right", right_index=True, how="right"
),
left_index=True,
right_on="_key_left",
how="right",
suffixes=("_{}".format(lsuffix), "_{}".format(rsuffix)),
)
.set_index(index_right)
.drop(["_key_left", "_key_right"], axis=1)
.set_geometry(right_df.geometry.name)
)
if isinstance(index_right, list):
joined.index.names = right_index_name
else:
joined.index.name = right_index_name
joined = joined.set_geometry(right_df.geometry.name)
joined = _restore_index(joined, right_index, right_index_original)
return joined
return joined, distances
def _nearest_query(
@@ -357,13 +514,8 @@ def _nearest_query(
how: str,
return_distance: bool,
exclusive: bool,
on_attribute: Optional[list] = None,
):
if not (compat.USE_SHAPELY_20 or (compat.USE_PYGEOS and compat.PYGEOS_GE_010)):
raise NotImplementedError(
"Currently, only PyGEOS >= 0.10.0 or Shapely >= 2.0 supports "
"`nearest_all`. " + compat.INSTALL_PYGEOS_ERROR
)
# use the opposite of the join direction for the index
use_left_as_sindex = how == "right"
if use_left_as_sindex:
@@ -393,15 +545,37 @@ def _nearest_query(
distances = distances[sort_order]
else:
l_idx, r_idx = input_idx, tree_idx
join_df = pd.DataFrame(
{"_key_left": l_idx, "_key_right": r_idx, "distances": distances}
)
else:
# when sindex is empty / has no valid geometries
join_df = pd.DataFrame(
columns=["_key_left", "_key_right", "distances"], dtype=float
)
return join_df
l_idx, r_idx = np.array([], dtype=np.intp), np.array([], dtype=np.intp)
if return_distance:
distances = np.array([], dtype=np.float64)
else:
distances = None
if on_attribute:
for attr in on_attribute:
(l_idx, r_idx), shared_attribute_rows = _filter_shared_attribute(
left_df, right_df, l_idx, r_idx, attr
)
distances = distances[shared_attribute_rows]
return (l_idx, r_idx), distances
def _filter_shared_attribute(left_df, right_df, l_idx, r_idx, attribute):
"""
Returns the indices for the left and right dataframe that share the same entry
in the attribute column. Also returns a Boolean `shared_attribute_rows` for rows
with the same entry.
"""
shared_attribute_rows = (
left_df[attribute].iloc[l_idx].values == right_df[attribute].iloc[r_idx].values
)
l_idx = l_idx[shared_attribute_rows]
r_idx = r_idx[shared_attribute_rows]
return (l_idx, r_idx), shared_attribute_rows
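The attribute filter above boils down to a boolean mask over the matched index pairs; a tiny self-contained example of the same idea:

```python
import numpy as np
import pandas as pd

left_attr = pd.Series(["x", "y", "z"])
right_attr = pd.Series(["x", "q"])

l_idx = np.array([0, 1, 2])
r_idx = np.array([0, 0, 1])

# keep only pairs whose attribute values agree on both sides
mask = left_attr.iloc[l_idx].values == right_attr.iloc[r_idx].values  # [True, False, False]
l_idx, r_idx = l_idx[mask], r_idx[mask]   # (array([0]), array([0]))
```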
def sjoin_nearest(
@@ -453,7 +627,6 @@ def sjoin_nearest(
exclusive : bool, default False
If True, the nearest geometries that are equal to the input geometry
will not be returned, default False.
Requires Shapely >= 2.0.
Examples
--------
@@ -466,7 +639,7 @@ def sjoin_nearest(
... ).to_crs(groceries.crs)
>>> chicago.head() # doctest: +SKIP
ComAreaID ... geometry
ComAreaID ... geometry
0 35 ... POLYGON ((-87.60914 41.84469, -87.60915 41.844...
1 36 ... POLYGON ((-87.59215 41.81693, -87.59231 41.816...
2 37 ... POLYGON ((-87.62880 41.80189, -87.62879 41.801...
@@ -475,19 +648,19 @@ def sjoin_nearest(
[5 rows x 87 columns]
>>> groceries.head() # doctest: +SKIP
OBJECTID Ycoord ... Category geometry
0 16 41.973266 ... NaN MULTIPOINT (-87.65661 41.97321)
1 18 41.696367 ... NaN MULTIPOINT (-87.68136 41.69713)
2 22 41.868634 ... NaN MULTIPOINT (-87.63918 41.86847)
3 23 41.877590 ... new MULTIPOINT (-87.65495 41.87783)
4 27 41.737696 ... NaN MULTIPOINT (-87.62715 41.73623)
OBJECTID Ycoord ... Category geometry
0 16 41.973266 ... NaN MULTIPOINT ((-87.65661 41.97321))
1 18 41.696367 ... NaN MULTIPOINT ((-87.68136 41.69713))
2 22 41.868634 ... NaN MULTIPOINT ((-87.63918 41.86847))
3 23 41.877590 ... new MULTIPOINT ((-87.65495 41.87783))
4 27 41.737696 ... NaN MULTIPOINT ((-87.62715 41.73623))
[5 rows x 8 columns]
>>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago)
>>> groceries_w_communities[["Chain", "community", "geometry"]].head(2)
Chain community geometry
0 VIET HOA PLAZA UPTOWN MULTIPOINT (1168268.672 1933554.350)
87 JEWEL OSCO UPTOWN MULTIPOINT (1168837.980 1929246.962)
Chain community geometry
0 VIET HOA PLAZA UPTOWN MULTIPOINT ((1168268.672 1933554.35))
1 COUNTY FAIR FOODS MORGAN PARK MULTIPOINT ((1162302.618 1832900.224))
To include the distances:
@@ -495,10 +668,10 @@ def sjoin_nearest(
>>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago, \
distance_col="distances")
>>> groceries_w_communities[["Chain", "community", \
"distances"]].head(2) # doctest: +SKIP
Chain community distances
0 VIET HOA PLAZA UPTOWN 0.0
87 JEWEL OSCO UPTOWN 0.0
"distances"]].head(2)
Chain community distances
0 VIET HOA PLAZA UPTOWN 0.0
1 COUNTY FAIR FOODS MORGAN PARK 0.0
In the following example, we get multiple groceries for Uptown because all
results are equidistant (in this case zero because they intersect).
@@ -508,7 +681,7 @@ distance_col="distances")
distance_col="distances", how="right")
>>> uptown_results = \
chicago_w_groceries[chicago_w_groceries["community"] == "UPTOWN"]
>>> uptown_results[["Chain", "community"]] # doctest: +SKIP
>>> uptown_results[["Chain", "community"]]
Chain community
30 VIET HOA PLAZA UPTOWN
30 JEWEL OSCO UPTOWN
@@ -528,6 +701,7 @@ chicago_w_groceries[chicago_w_groceries["community"] == "UPTOWN"]
Every operation in GeoPandas is planar, i.e. the potential third
dimension is not taken into account.
"""
_basic_checks(left_df, right_df, how, lsuffix, rsuffix)
left_df.geometry.values.check_geographic_crs(stacklevel=1)
@@ -535,19 +709,26 @@ chicago_w_groceries[chicago_w_groceries["community"] == "UPTOWN"]
return_distance = distance_col is not None
join_df = _nearest_query(
left_df, right_df, max_distance, how, return_distance, exclusive
indices, distances = _nearest_query(
left_df,
right_df,
max_distance,
how,
return_distance,
exclusive,
)
joined, distances = _frame_join(
left_df,
right_df,
indices,
distances,
how,
lsuffix,
rsuffix,
None,
)
if return_distance:
join_df = join_df.rename(columns={"distances": distance_col})
else:
join_df.pop("distances")
joined = _frame_join(join_df, left_df, right_df, how, lsuffix, rsuffix)
if return_distance:
columns = [c for c in joined.columns if c != distance_col] + [distance_col]
joined = joined[columns]
joined[distance_col] = distances
return joined
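With the refactor, `sjoin_nearest` receives the distances straight from the query and appends them as the final column when `distance_col` is given. A brief usage sketch with invented layers:

```python
import geopandas
from shapely.geometry import Point

stops = geopandas.GeoDataFrame({"geometry": [Point(0, 0), Point(3, 3)]})
shops = geopandas.GeoDataFrame(
    {"name": ["a", "b"], "geometry": [Point(1, 0), Point(3, 4)]}
)

nearest = geopandas.sjoin_nearest(stops, shops, distance_col="dist")
print(nearest[["name", "dist"]])
#   name  dist
# 0    a   1.0
# 1    b   1.0
```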

View File

@@ -1,28 +1,28 @@
"""Tests for the clip module."""
import numpy as np
import pandas as pd
import shapely
from shapely.geometry import (
Polygon,
Point,
LineString,
LinearRing,
GeometryCollection,
LinearRing,
LineString,
MultiPoint,
Point,
Polygon,
box,
)
import geopandas
from geopandas import GeoDataFrame, GeoSeries, clip
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
import pytest
from geopandas._compat import HAS_PYPROJ
from geopandas.tools.clip import _mask_is_list_like_rectangle
pytestmark = pytest.mark.skip_no_sindex
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from pandas.testing import assert_index_equal
mask_variants_single_rectangle = [
"single_rectangle_gdf",
"single_rectangle_gdf_list_bounds",
@@ -43,6 +43,14 @@ def point_gdf():
return gdf
@pytest.fixture
def point_gdf2():
"""Create a point GeoDataFrame."""
pts = np.array([[5, 5], [2, 2], [4, 4], [0, 0], [3, 3], [1, 1]])
gdf = GeoDataFrame([Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857")
return gdf
@pytest.fixture
def pointsoutside_nooverlap_gdf():
"""Create a point GeoDataFrame. Its points are all outside the single
@@ -137,7 +145,7 @@ def two_line_gdf():
@pytest.fixture
def multi_poly_gdf(donut_geometry):
"""Create a multi-polygon GeoDataFrame."""
multi_poly = donut_geometry.unary_union
multi_poly = donut_geometry.union_all()
out_df = GeoDataFrame(geometry=GeoSeries(multi_poly), crs="EPSG:3857")
out_df["attr"] = ["pool"]
return out_df
@@ -148,7 +156,7 @@ def multi_line(two_line_gdf):
"""Create a multi-line GeoDataFrame.
This GDF has one multiline and one regular line."""
# Create a single and multi line object
multiline_feat = two_line_gdf.unary_union
multiline_feat = two_line_gdf.union_all()
linec = LineString([(2, 1), (3, 1), (4, 1), (5, 2)])
out_df = GeoDataFrame(geometry=GeoSeries([multiline_feat, linec]), crs="EPSG:3857")
out_df["attr"] = ["road", "stream"]
@@ -158,7 +166,7 @@ def multi_line(two_line_gdf):
@pytest.fixture
def multi_point(point_gdf):
"""Create a multi-point GeoDataFrame."""
multi_point = point_gdf.unary_union
multi_point = point_gdf.union_all()
out_df = GeoDataFrame(
geometry=GeoSeries(
[multi_point, Point(2, 5), Point(-11, -14), Point(-10, -12)]
@@ -321,7 +329,7 @@ class TestClipWithSingleRectangleGdf:
)
assert clipped.iloc[0].geometry.wkt == clipped_mutltipoint.wkt
shape_for_points = (
box(*mask) if _mask_is_list_like_rectangle(mask) else mask.unary_union
box(*mask) if _mask_is_list_like_rectangle(mask) else mask.union_all()
)
assert all(clipped.intersects(shape_for_points))
@@ -398,6 +406,7 @@ def test_clip_multipoly_keep_slivers(multi_poly_gdf, single_rectangle_gdf):
assert "GeometryCollection" in clipped.geom_type[0]
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not available")
def test_warning_crs_mismatch(point_gdf, single_rectangle_gdf):
with pytest.warns(UserWarning, match="CRS mismatch between the CRS"):
clip(point_gdf, single_rectangle_gdf.to_crs(4326))
@@ -460,3 +469,16 @@ def test_clip_empty_mask(buffered_locations, mask):
)
clipped = clip(buffered_locations.geometry, mask)
assert_geoseries_equal(clipped, GeoSeries([], crs="EPSG:3857"))
def test_clip_sorting(point_gdf2):
"""Test the sorting kwarg in clip"""
bbox = shapely.geometry.box(0, 0, 2, 2)
unsorted_clipped_gdf = point_gdf2.clip(bbox)
sorted_clipped_gdf = point_gdf2.clip(bbox, sort=True)
expected_sorted_index = pd.Index([1, 3, 5])
assert not (sorted(unsorted_clipped_gdf.index) == unsorted_clipped_gdf.index).all()
assert (sorted(sorted_clipped_gdf.index) == sorted_clipped_gdf.index).all()
assert_index_equal(expected_sorted_index, sorted_clipped_gdf.index)

View File

@@ -1,4 +1,5 @@
import numpy as np
from shapely.geometry import Point
from shapely.wkt import loads

View File

@@ -1,28 +1,46 @@
import pytest
import numpy
import geopandas
import geopandas._compat as compat
import geopandas
from geopandas.tools._random import uniform
multipolygons = geopandas.read_file(geopandas.datasets.get_path("nybb")).geometry
polygons = multipolygons.explode(ignore_index=True).geometry
multilinestrings = multipolygons.boundary
linestrings = polygons.boundary
points = multipolygons.centroid
import pytest
@pytest.fixture
def multipolygons(nybb_filename):
return geopandas.read_file(nybb_filename).geometry
@pytest.fixture
def polygons(multipolygons):
return multipolygons.explode(ignore_index=True).geometry
@pytest.fixture
def multilinestrings(multipolygons):
return multipolygons.boundary
@pytest.fixture
def linestrings(polygons):
return polygons.boundary
@pytest.fixture
def points(multipolygons):
return multipolygons.centroid
@pytest.mark.skipif(
not (compat.USE_PYGEOS or compat.USE_SHAPELY_20),
reason="array input in interpolate not implemented for shapely<2",
)
@pytest.mark.parametrize("size", [10, 100])
@pytest.mark.parametrize(
"geom", [multipolygons[0], polygons[0], multilinestrings[0], linestrings[0]]
"geom_fixture", ["multipolygons", "polygons", "multilinestrings", "linestrings"]
)
def test_uniform(geom, size):
def test_uniform(geom_fixture, size, request):
geom = request.getfixturevalue(geom_fixture)[0]
sample = uniform(geom, size=size, rng=1)
sample_series = geopandas.GeoSeries(sample).explode().reset_index(drop=True)
sample_series = (
geopandas.GeoSeries(sample).explode(index_parts=True).reset_index(drop=True)
)
assert len(sample_series) == size
sample_in_geom = sample_series.buffer(0.00000001).sindex.query(
geom, predicate="intersects"
@@ -30,21 +48,13 @@ def test_uniform(geom, size):
assert len(sample_in_geom) == size
@pytest.mark.skipif(
not (compat.USE_PYGEOS or compat.USE_SHAPELY_20),
reason="array input in interpolate not implemented for shapely<2",
)
def test_uniform_unsupported():
def test_uniform_unsupported(points):
with pytest.warns(UserWarning, match="Sampling is not supported"):
sample = uniform(points[0], size=10, rng=1)
assert sample.is_empty
@pytest.mark.skipif(
not (compat.USE_PYGEOS or compat.USE_SHAPELY_20),
reason="array input in interpolate not implemented for shapely<2",
)
def test_uniform_generator():
def test_uniform_generator(polygons):
sample = uniform(polygons[0], size=10, rng=1)
sample2 = uniform(polygons[0], size=10, rng=1)
assert sample.equals(sample2)
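These tests exercise the private `geopandas.tools._random.uniform` helper; a small reproducibility sketch mirroring `test_uniform_generator` (the square polygon is made up):

```python
from shapely.geometry import Polygon
from geopandas.tools._random import uniform   # private helper used by the tests above

square = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])

# the same rng seed yields the same sample, as the test asserts
a = uniform(square, size=10, rng=1)
b = uniform(square, size=10, rng=1)
assert a.equals(b)
```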

View File

@@ -3,23 +3,24 @@ from typing import Sequence
import numpy as np
import pandas as pd
import shapely
from shapely.geometry import Point, Polygon, GeometryCollection
import shapely
from shapely.geometry import GeometryCollection, Point, Polygon, box
import geopandas
import geopandas._compat as compat
from geopandas import GeoDataFrame, GeoSeries, read_file, sjoin, sjoin_nearest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from geopandas import (
GeoDataFrame,
GeoSeries,
points_from_xy,
read_file,
sjoin,
sjoin_nearest,
)
from pandas.testing import assert_frame_equal, assert_series_equal
import pytest
TEST_NEAREST = compat.USE_SHAPELY_20 or (compat.PYGEOS_GE_010 and compat.USE_PYGEOS)
pytestmark = pytest.mark.skip_no_sindex
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal
@pytest.fixture()
@@ -95,6 +96,52 @@ def dfs(request):
return [request.param, df1, df2, expected]
@pytest.fixture()
def dfs_shared_attribute():
geo_left = [
Point(0, 0),
Point(1, 1),
Point(2, 2),
Point(3, 3),
Point(4, 4),
Point(5, 5),
Point(6, 6),
Point(7, 7),
]
geo_right = [
Point(0, 0),
Point(1, 1),
Point(2, 2),
Point(3, 3),
Point(4, 4),
Point(5, 5),
Point(6, 6),
Point(7, 7),
]
attr_tracker = ["A", "B", "C", "D", "E", "F", "G", "H"]
left_gdf = geopandas.GeoDataFrame(
{
"geometry": geo_left,
"attr_tracker": attr_tracker,
"duplicate_column": [0, 1, 2, 3, 4, 5, 6, 7],
"attr1": [True, True, True, True, True, True, True, True],
"attr2": [True, True, True, True, True, True, True, True],
}
)
right_gdf = geopandas.GeoDataFrame(
{
"geometry": geo_right,
"duplicate_column": [0, 1, 2, 3, 4, 5, 6, 7],
"attr1": [True, True, False, False, True, True, False, False],
"attr2": [True, True, False, False, False, False, False, False],
}
)
return left_gdf, right_gdf
class TestSpatialJoin:
@pytest.mark.parametrize(
"how, lsuffix, rsuffix, expected_cols",
@@ -113,6 +160,7 @@ class TestSpatialJoin:
joined = sjoin(left, right, how=how, lsuffix=lsuffix, rsuffix=rsuffix)
assert set(joined.columns) == expected_cols | {"geometry"}
@pytest.mark.skipif(not compat.HAS_PYPROJ, reason="pyproj not available")
@pytest.mark.parametrize("dfs", ["default-index", "string-index"], indirect=True)
def test_crs_mismatch(self, dfs):
index, df1, df2, expected = dfs
@@ -120,31 +168,6 @@ class TestSpatialJoin:
with pytest.warns(UserWarning, match="CRS mismatch between the CRS"):
sjoin(df1, df2)
@pytest.mark.parametrize("dfs", ["default-index"], indirect=True)
@pytest.mark.parametrize("op", ["intersects", "contains", "within"])
def test_deprecated_op_param(self, dfs, op):
_, df1, df2, _ = dfs
with pytest.warns(FutureWarning, match="`op` parameter is deprecated"):
sjoin(df1, df2, op=op)
@pytest.mark.parametrize("dfs", ["default-index"], indirect=True)
@pytest.mark.parametrize("op", ["intersects", "contains", "within"])
@pytest.mark.parametrize("predicate", ["contains", "within"])
def test_deprecated_op_param_nondefault_predicate(self, dfs, op, predicate):
_, df1, df2, _ = dfs
match = "use the `predicate` parameter instead"
if op != predicate:
warntype = UserWarning
match = (
"`predicate` will be overridden by the value of `op`" # noqa: ISC003
+ r"(.|\s)*"
+ match
)
else:
warntype = FutureWarning
with pytest.warns(warntype, match=match):
sjoin(df1, df2, predicate=predicate, op=op)
@pytest.mark.parametrize("dfs", ["default-index"], indirect=True)
def test_unknown_kwargs(self, dfs):
_, df1, df2, _ = dfs
@@ -154,7 +177,6 @@ class TestSpatialJoin:
):
sjoin(df1, df2, extra_param="test")
@pytest.mark.filterwarnings("ignore:The `op` parameter:FutureWarning")
@pytest.mark.parametrize(
"dfs",
[
@@ -167,12 +189,10 @@ class TestSpatialJoin:
indirect=True,
)
@pytest.mark.parametrize("predicate", ["intersects", "contains", "within"])
@pytest.mark.parametrize("predicate_kw", ["predicate", "op"])
def test_inner(self, predicate, predicate_kw, dfs):
def test_inner(self, predicate, dfs):
index, df1, df2, expected = dfs
res = sjoin(df1, df2, how="inner", **{predicate_kw: predicate})
res = sjoin(df1, df2, how="inner", predicate=predicate)
exp = expected[predicate].dropna().copy()
exp = exp.drop("geometry_y", axis=1).rename(columns={"geometry_x": "geometry"})
exp[["df1", "df2"]] = exp[["df1", "df2"]].astype("int64")
@@ -182,7 +202,7 @@ class TestSpatialJoin:
].astype("int64")
if index == "named-index":
exp[["df1_ix", "df2_ix"]] = exp[["df1_ix", "df2_ix"]].astype("int64")
exp = exp.set_index("df1_ix").rename(columns={"df2_ix": "index_right"})
exp = exp.set_index("df1_ix")
if index in ["default-index", "string-index"]:
exp = exp.set_index("index_left")
exp.index.name = None
@@ -192,11 +212,7 @@ class TestSpatialJoin:
)
exp.index.names = df1.index.names
if index == "named-multi-index":
exp = exp.set_index(["df1_ix1", "df1_ix2"]).rename(
columns={"df2_ix1": "index_right0", "df2_ix2": "index_right1"}
)
exp.index.names = df1.index.names
exp = exp.set_index(["df1_ix1", "df1_ix2"])
assert_frame_equal(res, exp)
@pytest.mark.parametrize(
@@ -232,7 +248,7 @@ class TestSpatialJoin:
res["index_right"] = res["index_right"].astype(float)
elif index == "named-index":
exp[["df1_ix"]] = exp[["df1_ix"]].astype("int64")
exp = exp.set_index("df1_ix").rename(columns={"df2_ix": "index_right"})
exp = exp.set_index("df1_ix")
if index in ["default-index", "string-index"]:
exp = exp.set_index("index_left")
exp.index.name = None
@@ -242,10 +258,7 @@ class TestSpatialJoin:
)
exp.index.names = df1.index.names
if index == "named-multi-index":
exp = exp.set_index(["df1_ix1", "df1_ix2"]).rename(
columns={"df2_ix1": "index_right0", "df2_ix2": "index_right1"}
)
exp.index.names = df1.index.names
exp = exp.set_index(["df1_ix1", "df1_ix2"])
assert_frame_equal(res, exp)
@@ -348,7 +361,7 @@ class TestSpatialJoin:
res["index_left"] = res["index_left"].astype(float)
elif index == "named-index":
exp[["df2_ix"]] = exp[["df2_ix"]].astype("int64")
exp = exp.set_index("df2_ix").rename(columns={"df1_ix": "index_left"})
exp = exp.set_index("df2_ix")
if index in ["default-index", "string-index"]:
exp = exp.set_index("index_right")
exp = exp.reindex(columns=res.columns)
@@ -359,20 +372,431 @@ class TestSpatialJoin:
)
exp.index.names = df2.index.names
if index == "named-multi-index":
exp = exp.set_index(["df2_ix1", "df2_ix2"]).rename(
columns={"df1_ix1": "index_left0", "df1_ix2": "index_left1"}
)
exp.index.names = df2.index.names
exp = exp.set_index(["df2_ix1", "df2_ix2"])
if predicate == "within":
exp = exp.sort_index()
assert_frame_equal(res, exp, check_index_type=False)
@pytest.mark.skipif(not compat.GEOS_GE_310, reason="`dwithin` requires GEOS 3.10")
@pytest.mark.parametrize("how", ["inner"])
@pytest.mark.parametrize(
"geo_left, geo_right, expected_left, expected_right, distance",
[
(
# Distance is number, 2x1
[Point(0, 0), Point(1, 1)],
[Point(1, 1)],
[0, 1],
[0, 0],
math.sqrt(2),
),
# Distance is number, 2x2
(
[Point(0, 0), Point(1, 1)],
[Point(0, 0), Point(1, 1)],
[0, 1, 0, 1],
[0, 0, 1, 1],
math.sqrt(2),
),
# Distance is array, matches len(left)
(
[Point(0, 0), Point(0, 0), Point(-1, -1)],
[Point(1, 1)],
[1, 2],
[0, 0],
[0, math.sqrt(2), math.sqrt(8)],
),
# Distance is np.array, matches len(left),
# inner join sorts the right GeoDataFrame
(
[Point(0, 0), Point(0, 0), Point(-1, -1)],
[Point(1, 1), Point(0.5, 0.5)],
[1, 2, 1, 2],
[1, 1, 0, 0],
np.array([0, math.sqrt(2), math.sqrt(8)]),
),
],
)
def test_sjoin_dwithin(
self,
geo_left,
geo_right,
expected_left: Sequence[int],
expected_right: Sequence[int],
distance,
how,
):
left = geopandas.GeoDataFrame({"geometry": geo_left})
right = geopandas.GeoDataFrame({"geometry": geo_right})
expected_gdf = left.iloc[expected_left].copy()
expected_gdf["index_right"] = expected_right
joined = sjoin(left, right, how=how, predicate="dwithin", distance=distance)
assert_frame_equal(expected_gdf.sort_index(), joined.sort_index())
# GH3239
@pytest.mark.parametrize(
"predicate",
[
"contains",
"contains_properly",
"covered_by",
"covers",
"crosses",
"intersects",
"touches",
"within",
],
)
def test_sjoin_left_order(self, predicate):
# a set of points in random order -> that order should be preserved
# with a left join
pts = GeoDataFrame(
geometry=points_from_xy([0.1, 0.4, 0.3, 0.7], [0.8, 0.6, 0.9, 0.1])
)
polys = GeoDataFrame(
{"id": [1, 2, 3, 4]},
geometry=[
box(0, 0, 0.5, 0.5),
box(0, 0.5, 0.5, 1),
box(0.5, 0, 1, 0.5),
box(0.5, 0.5, 1, 1),
],
)
joined = sjoin(pts, polys, predicate=predicate, how="left")
assert_index_equal(joined.index, pts.index)
def test_sjoin_shared_attribute(self, naturalearth_lowres, naturalearth_cities):
countries = read_file(naturalearth_lowres)
cities = read_file(naturalearth_cities)
countries = countries[["geometry", "name"]].rename(columns={"name": "country"})
# Add first letter of country/city as an attribute column to be compared
countries["firstLetter"] = countries["country"].astype(str).str[0]
cities["firstLetter"] = cities["name"].astype(str).str[0]
result = sjoin(cities, countries, on_attribute="firstLetter")
assert (
result["country"].astype(str).str[0] == result["name"].astype(str).str[0]
).all()
assert result.shape == (23, 5)
@pytest.mark.parametrize(
"attr1_key_change_dict, attr2_key_change_dict",
[
pytest.param(
{True: "merge", False: "no_merge"},
{True: "merge", False: "no_merge"},
id="merge on string attributes",
),
pytest.param(
{True: 2, False: 1},
{True: 2, False: 1},
id="merge on integer attributes",
),
pytest.param(
{True: True, False: False},
{True: True, False: False},
id="merge on boolean attributes",
),
pytest.param(
{True: True, False: False},
{True: "merge", False: "no_merge"},
id="merge on mixed attributes",
),
],
)
def test_sjoin_multiple_attributes_datatypes(
self, dfs_shared_attribute, attr1_key_change_dict, attr2_key_change_dict
):
left_gdf, right_gdf = dfs_shared_attribute
left_gdf["attr1"] = left_gdf["attr1"].map(attr1_key_change_dict)
left_gdf["attr2"] = left_gdf["attr2"].map(attr2_key_change_dict)
right_gdf["attr1"] = right_gdf["attr1"].map(attr1_key_change_dict)
right_gdf["attr2"] = right_gdf["attr2"].map(attr2_key_change_dict)
joined = sjoin(left_gdf, right_gdf, on_attribute=("attr1", "attr2"))
assert (["A", "B"] == joined["attr_tracker"].values).all()
def test_sjoin_multiple_attributes_check_header(self, dfs_shared_attribute):
left_gdf, right_gdf = dfs_shared_attribute
joined = sjoin(left_gdf, right_gdf, on_attribute=["attr1"])
assert (["A", "B", "E", "F"] == joined["attr_tracker"].values).all()
assert {"attr2_left", "attr2_right", "attr1"}.issubset(joined.columns)
assert "attr1_left" not in joined
def test_sjoin_error_column_does_not_exist(self, dfs_shared_attribute):
left_gdf, right_gdf = dfs_shared_attribute
right_gdf_dropped_attr = right_gdf.drop("attr1", axis=1)
left_gdf_dropped_attr = left_gdf.drop("attr1", axis=1)
with pytest.raises(
ValueError,
match="Expected column attr1 is missing from the right dataframe.",
):
sjoin(left_gdf, right_gdf_dropped_attr, on_attribute="attr1")
with pytest.raises(
ValueError,
match="Expected column attr1 is missing from the left dataframe.",
):
sjoin(left_gdf_dropped_attr, right_gdf, on_attribute="attr1")
with pytest.raises(
ValueError,
match="Expected column attr1 is missing from both of the dataframes.",
):
sjoin(left_gdf_dropped_attr, right_gdf_dropped_attr, on_attribute="attr1")
def test_sjoin_error_use_geometry_column(self, dfs_shared_attribute):
left_gdf, right_gdf = dfs_shared_attribute
with pytest.raises(
ValueError,
match="Active geometry column cannot be used as an input for "
"on_attribute parameter.",
):
sjoin(left_gdf, right_gdf, on_attribute="geometry")
with pytest.raises(
ValueError,
match="Active geometry column cannot be used as an input for "
"on_attribute parameter.",
):
sjoin(left_gdf, right_gdf, on_attribute=["attr1", "geometry"])
class TestIndexNames:
@pytest.mark.parametrize("how", ["inner", "left", "right"])
def test_preserve_index_names(self, how):
# preserve names of both left and right index
geoms = [Point(1, 1), Point(2, 2)]
df1 = GeoDataFrame({"geometry": geoms}, index=pd.Index([1, 2], name="myidx1"))
df2 = GeoDataFrame(
{"geometry": geoms}, index=pd.Index(["a", "b"], name="myidx2")
)
result = sjoin(df1, df2, how=how)
if how in ("inner", "left"):
expected = GeoDataFrame(
{"myidx1": [1, 2], "geometry": geoms, "myidx2": ["a", "b"]}
).set_index("myidx1")
else:
# right join
expected = GeoDataFrame(
{"myidx2": ["a", "b"], "myidx1": [1, 2], "geometry": geoms},
).set_index("myidx2")
assert_geodataframe_equal(result, expected)
# but also add suffixes if both left and right have the same index
df1.index.name = "myidx"
df2.index.name = "myidx"
result = sjoin(df1, df2, how=how)
if how in ("inner", "left"):
expected = GeoDataFrame(
{"myidx_left": [1, 2], "geometry": geoms, "myidx_right": ["a", "b"]}
).set_index("myidx_left")
else:
# right join
expected = GeoDataFrame(
{"myidx_right": ["a", "b"], "myidx_left": [1, 2], "geometry": geoms},
).set_index("myidx_right")
assert_geodataframe_equal(result, expected)
@pytest.mark.parametrize("how", ["inner", "left", "right"])
def test_preserve_index_names_multiindex(self, how):
# preserve names of both left and right index
geoms = [Point(1, 1), Point(2, 2)]
df1 = GeoDataFrame(
{"geometry": geoms},
index=pd.MultiIndex.from_tuples(
[("a", 1), ("b", 2)], names=["myidx1", "level2"]
),
)
df2 = GeoDataFrame(
{"geometry": geoms},
index=pd.MultiIndex.from_tuples(
[("c", 3), ("d", 4)], names=["myidx2", None]
),
)
result = sjoin(df1, df2, how=how)
expected_base = GeoDataFrame(
{
"myidx1": ["a", "b"],
"level2": [1, 2],
"geometry": geoms,
"myidx2": ["c", "d"],
"index_right1": [3, 4],
}
)
if how in ("inner", "left"):
expected = expected_base.set_index(["myidx1", "level2"])
else:
# right join
expected = expected_base.set_index(["myidx2", "index_right1"])
# if it was originally None, that is preserved
expected.index.names = ["myidx2", None]
assert_geodataframe_equal(result, expected)
# but also add suffixes if both left and right have the same index
df1.index.names = ["myidx", "level2"]
df2.index.names = ["myidx", None]
result = sjoin(df1, df2, how=how)
expected_base = GeoDataFrame(
{
"myidx_left": ["a", "b"],
"level2": [1, 2],
"geometry": geoms,
"myidx_right": ["c", "d"],
"index_right1": [3, 4],
}
)
if how in ("inner", "left"):
expected = expected_base.set_index(["myidx_left", "level2"])
else:
# right join
expected = expected_base.set_index(["myidx_right", "index_right1"])
# if it was originally None, that is preserved
expected.index.names = ["myidx_right", None]
assert_geodataframe_equal(result, expected)
@pytest.mark.parametrize("how", ["inner", "left", "right"])
def test_duplicate_column_index_name(self, how):
# case where a left column and the right index have the same name or the
# other way around -> correctly add suffix or preserve index name
geoms = [Point(1, 1), Point(2, 2)]
df1 = GeoDataFrame({"myidx": [1, 2], "geometry": geoms})
df2 = GeoDataFrame(
{"geometry": geoms}, index=pd.Index(["a", "b"], name="myidx")
)
result = sjoin(df1, df2, how=how)
if how in ("inner", "left"):
expected = GeoDataFrame(
{"myidx_left": [1, 2], "geometry": geoms, "myidx_right": ["a", "b"]}
)
else:
# right join
expected = GeoDataFrame(
{"index_left": [0, 1], "myidx_left": [1, 2], "geometry": geoms},
index=pd.Index(["a", "b"], name="myidx_right"),
)
assert_geodataframe_equal(result, expected)
result = sjoin(df2, df1, how=how)
if how in ("inner", "left"):
expected = GeoDataFrame(
{"geometry": geoms, "index_right": [0, 1], "myidx_right": [1, 2]},
index=pd.Index(["a", "b"], name="myidx_left"),
)
else:
# right join
expected = GeoDataFrame(
{"myidx_left": ["a", "b"], "myidx_right": [1, 2], "geometry": geoms},
)
assert_geodataframe_equal(result, expected)
@pytest.mark.parametrize("how", ["inner", "left", "right"])
def test_duplicate_column_index_name_multiindex(self, how):
# case where a left column and the right index have the same name or the
# other way around -> correctly add suffix or preserve index name
geoms = [Point(1, 1), Point(2, 2)]
df1 = GeoDataFrame({"myidx": [1, 2], "geometry": geoms})
df2 = GeoDataFrame(
{"geometry": geoms},
index=pd.MultiIndex.from_tuples(
[("a", 1), ("b", 2)], names=["myidx", "level2"]
),
)
result = sjoin(df1, df2, how=how)
if how in ("inner", "left"):
expected = GeoDataFrame(
{
"myidx_left": [1, 2],
"geometry": geoms,
"myidx_right": ["a", "b"],
"level2": [1, 2],
}
)
else:
# right join
expected = GeoDataFrame(
{"index_left": [0, 1], "myidx_left": [1, 2], "geometry": geoms},
index=pd.MultiIndex.from_tuples(
[("a", 1), ("b", 2)], names=["myidx_right", "level2"]
),
)
assert_geodataframe_equal(result, expected)
result = sjoin(df2, df1, how=how)
if how in ("inner", "left"):
expected = GeoDataFrame(
{"geometry": geoms, "index_right": [0, 1], "myidx_right": [1, 2]},
index=pd.MultiIndex.from_tuples(
[("a", 1), ("b", 2)], names=["myidx_left", "level2"]
),
)
else:
# right join
expected = GeoDataFrame(
{
"myidx_left": ["a", "b"],
"level2": [1, 2],
"myidx_right": [1, 2],
"geometry": geoms,
},
)
assert_geodataframe_equal(result, expected)
@pytest.mark.parametrize("how", ["inner", "left", "right"])
def test_conflicting_column_index_name(self, how):
# test case where the auto-generated index name conflicts
geoms = [Point(1, 1), Point(2, 2)]
df1 = GeoDataFrame({"index_right": [1, 2], "geometry": geoms})
df2 = GeoDataFrame({"geometry": geoms})
with pytest.raises(ValueError, match="'index_right' cannot be a column name"):
sjoin(df1, df2, how=how)
@pytest.mark.parametrize("how", ["inner", "left", "right"])
def test_conflicting_column_with_suffix(self, how):
# test case where an input column already carries the suffixed name, so the
# result ends up with duplicated column labels
geoms = [Point(1, 1), Point(2, 2)]
df1 = GeoDataFrame(
{"column": [1, 2], "column_right": ["a", "b"], "geometry": geoms}
)
df2 = GeoDataFrame({"column": [0.1, 0.2], "geometry": geoms})
result = sjoin(df1, df2, how=how)
if how in ("inner", "left"):
expected = GeoDataFrame(
{1: [1, 2], 2: ["a", "b"], 3: geoms, 4: [0, 1], 5: [0.1, 0.2]}
)
expected.columns = [
"column_left",
"column_right",
"geometry",
"index_right",
"column_right",
]
else:
# right join
expected = GeoDataFrame(
{1: [0, 1], 2: [1, 2], 3: ["a", "b"], 4: [0.1, 0.2], 5: geoms}
)
expected.columns = [
"index_left",
"column_left",
"column_right",
"column_right",
"geometry",
]
expected = expected.set_geometry("geometry")
assert_geodataframe_equal(result, expected)
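# --- illustrative sketch, not part of the test suite -------------------------
# The TestIndexNames cases above pin down how sjoin disambiguates clashing
# column and index names via the lsuffix/rsuffix arguments (defaults "left" and
# "right"). A minimal sketch with hypothetical data:
def _sketch_suffix_behaviour():
    from shapely.geometry import Point

    from geopandas import GeoDataFrame, sjoin

    pts = GeoDataFrame({"value": [1, 2], "geometry": [Point(0, 0), Point(1, 1)]})
    joined = sjoin(pts, pts, lsuffix="a", rsuffix="b")
    # the clashing "value" columns come back as "value_a" and "value_b", and the
    # joined right-hand index is exposed as "index_b"
    return joined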
@pytest.mark.usefixtures("_setup_class_nybb_filename")
class TestSpatialJoinNYBB:
def setup_method(self):
nybb_filename = geopandas.datasets.get_path("nybb")
self.polydf = read_file(nybb_filename)
self.polydf = read_file(self.nybb_filename)
self.crs = self.polydf.crs
N = 20
b = [int(x) for x in self.polydf.total_bounds]
@@ -527,7 +951,7 @@ class TestSpatialJoinNYBB:
def test_sjoin_empty_geometries(self):
# https://github.com/geopandas/geopandas/issues/944
empty = GeoDataFrame(geometry=[GeometryCollection()] * 3, crs=self.crs)
df = sjoin(pd.concat([self.pointdf, empty]), self.polydf, how="left")
assert df.shape == (24, 8)
df2 = sjoin(self.pointdf, pd.concat([self.polydf, empty]), how="left")
@@ -542,8 +966,8 @@ class TestSpatialJoinNYBB:
assert sjoin(empty, self.pointdf, how="inner", predicate=predicate).empty
assert sjoin(empty, self.pointdf, how="left", predicate=predicate).empty
def test_empty_sjoin_return_duplicated_columns(self, nybb_filename):
nybb = geopandas.read_file(nybb_filename)
nybb2 = nybb.copy()
nybb2.geometry = nybb2.translate(200000) # to get non-overlapping
@@ -553,45 +977,24 @@ class TestSpatialJoinNYBB:
assert "BoroCode_left" in result.columns
@pytest.fixture
def world(naturalearth_lowres):
return read_file(naturalearth_lowres)
@pytest.fixture
def cities(naturalearth_cities):
return read_file(naturalearth_cities)
def test_sjoin_inner(world, cities):
# GH637
countries = world[["geometry", "name"]]
countries = countries.rename(columns={"name": "country"})
cities_with_country = sjoin(cities, countries, how="inner", predicate="intersects")
assert cities_with_country.shape == (213, 4)
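# --- illustrative sketch, not part of the test suite -------------------------
# TestNearest below exercises sjoin_nearest, which matches every left geometry
# to its nearest right geometry rather than requiring a predicate to hold.
# `max_distance` caps the search radius, `distance_col` records the measured
# distance, and `exclusive=True` keeps a geometry from matching an identical
# geometry (useful when joining a frame to itself). Hypothetical data:
def _sketch_sjoin_nearest_usage():
    from shapely.geometry import Point

    from geopandas import GeoDataFrame, sjoin_nearest

    pts = GeoDataFrame(geometry=[Point(0, 0), Point(0, 3)])
    targets = GeoDataFrame(geometry=[Point(0, 1)])
    # both points join to the single target; "dist" holds 1.0 and 2.0
    return sjoin_nearest(pts, targets, distance_col="dist")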
class TestNearest:
@pytest.mark.parametrize(
"how_kwargs", ({}, {"how": "inner"}, {"how": "left"}, {"how": "right"})
@@ -900,10 +1303,10 @@ class TestNearest:
assert_geodataframe_equal(expected_gdf, joined)
@pytest.mark.filterwarnings("ignore:Geometry is in a geographic CRS")
def test_sjoin_nearest_inner(self, naturalearth_lowres, naturalearth_cities):
# check equivalency of left and inner join
countries = read_file(naturalearth_lowres)
cities = read_file(naturalearth_cities)
countries = countries[["geometry", "name"]].rename(columns={"name": "country"})
# default: inner and left give the same result
@@ -927,19 +1330,8 @@ class TestNearest:
result5["index_right"] = result5["index_right"].astype("int64")
assert_geodataframe_equal(result5, result4, check_like=True)
@pytest.mark.parametrize(
"max_distance,expected", [(None, expected_index_uncapped), (1.1, [3, 3, 1, 2])]
"max_distance,expected", [(None, [1, 3, 3, 1, 2]), (1.1, [3, 3, 1, 2])]
)
def test_sjoin_nearest_exclusive(self, max_distance, expected):
geoms = shapely.points(np.arange(3), np.arange(3))