554 lines
20 KiB
Python
554 lines
20 KiB
Python
from typing import Optional
|
|
import warnings
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from geopandas import GeoDataFrame
|
|
from geopandas import _compat as compat
|
|
from geopandas.array import _check_crs, _crs_mismatch_warn
|
|
|
|
|
|
def sjoin(
    left_df,
    right_df,
    how="inner",
    predicate="intersects",
    lsuffix="left",
    rsuffix="right",
    **kwargs,
):
    """Spatial join of two GeoDataFrames.

    See the User Guide page :doc:`../../user_guide/mergingdata` for details.


    Parameters
    ----------
    left_df, right_df : GeoDataFrames
    how : string, default 'inner'
        The type of join:

        * 'left': use keys from left_df; retain only left_df geometry column
        * 'right': use keys from right_df; retain only right_df geometry column
        * 'inner': use intersection of keys from both dfs; retain only
          left_df geometry column
    predicate : string, default 'intersects'
        Binary predicate. Valid values are determined by the spatial index used.
        You can check the valid values in left_df or right_df as
        ``left_df.sindex.valid_query_predicates`` or
        ``right_df.sindex.valid_query_predicates``
        Replaces deprecated ``op`` parameter.
    lsuffix : string, default 'left'
        Suffix to apply to overlapping column names (left GeoDataFrame).
    rsuffix : string, default 'right'
        Suffix to apply to overlapping column names (right GeoDataFrame).
    **kwargs
        Only the deprecated ``op`` keyword is accepted. When given, its value
        replaces ``predicate`` and a warning is emitted. Any other keyword
        raises ``TypeError``.

    Returns
    -------
    GeoDataFrame
        The joined frame, as produced by ``_frame_join``.

    Raises
    ------
    TypeError
        If an unexpected keyword argument is passed.
    ValueError
        If the inputs fail the checks performed by ``_basic_checks``.

    Warns
    -----
    FutureWarning
        If the deprecated ``op`` keyword is used.
    UserWarning
        If ``op`` is used together with a non-default ``predicate`` that
        differs from it; ``op`` takes precedence.

    Examples
    --------
    >>> import geodatasets
    >>> chicago = geopandas.read_file(
    ...     geodatasets.get_path("geoda.chicago_health")
    ... )
    >>> groceries = geopandas.read_file(
    ...     geodatasets.get_path("geoda.groceries")
    ... ).to_crs(chicago.crs)

    >>> chicago.head()  # doctest: +SKIP
       ComAreaID  ...                                           geometry
    0         35  ...  POLYGON ((-87.60914 41.84469, -87.60915 41.844...
    1         36  ...  POLYGON ((-87.59215 41.81693, -87.59231 41.816...
    2         37  ...  POLYGON ((-87.62880 41.80189, -87.62879 41.801...
    3         38  ...  POLYGON ((-87.60671 41.81681, -87.60670 41.816...
    4         39  ...  POLYGON ((-87.59215 41.81693, -87.59215 41.816...
    [5 rows x 87 columns]

    >>> groceries.head()  # doctest: +SKIP
       OBJECTID     Ycoord  ...  Category                         geometry
    0        16  41.973266  ...       NaN  MULTIPOINT (-87.65661 41.97321)
    1        18  41.696367  ...       NaN  MULTIPOINT (-87.68136 41.69713)
    2        22  41.868634  ...       NaN  MULTIPOINT (-87.63918 41.86847)
    3        23  41.877590  ...       new  MULTIPOINT (-87.65495 41.87783)
    4        27  41.737696  ...       NaN  MULTIPOINT (-87.62715 41.73623)
    [5 rows x 8 columns]

    >>> groceries_w_communities = geopandas.sjoin(groceries, chicago)
    >>> groceries_w_communities.head()  # doctest: +SKIP
         OBJECTID     Ycoord     Xcoord  ...  GonorrF  GonorrM  Tuberc
    0          16  41.973266 -87.657073  ...    170.8    468.7    13.6
    87        365  41.961707 -87.654058  ...    170.8    468.7    13.6
    90        373  41.963131 -87.656352  ...    170.8    468.7    13.6
    140       582  41.969131 -87.674882  ...    170.8    468.7    13.6
    1          18  41.696367 -87.681315  ...    800.5    741.1     2.6
    [5 rows x 95 columns]

    See also
    --------
    overlay : overlay operation resulting in a new geometry
    GeoDataFrame.sjoin : equivalent method

    Notes
    -----
    Every operation in GeoPandas is planar, i.e. the potential third
    dimension is not taken into account.
    """
    if "op" in kwargs:
        op = kwargs.pop("op")
        deprecation_message = (
            "The `op` parameter is deprecated and will be removed"
            " in a future release. Please use the `predicate` parameter"
            " instead."
        )
        # NOTE(review): stacklevel=4 presumably targets the user's call site
        # when reached through a wrapper such as GeoDataFrame.sjoin -- confirm.
        if predicate != "intersects" and op != predicate:
            # Both keywords were given and disagree: `op` wins, so tell the
            # user explicitly that their `predicate` value is being ignored.
            override_message = (
                "A non-default value for `predicate` was passed"
                f' (got `predicate="{predicate}"`'
                f' in combination with `op="{op}"`).'
                " The value of `predicate` will be overridden by the value of `op`,"
                " which may result in unexpected behavior."
                f"\n{deprecation_message}"
            )
            warnings.warn(override_message, UserWarning, stacklevel=4)
        else:
            warnings.warn(deprecation_message, FutureWarning, stacklevel=4)
        predicate = op

    if kwargs:
        # Mimic Python's own error for unknown keywords, naming the first one.
        first = next(iter(kwargs))
        raise TypeError(f"sjoin() got an unexpected keyword argument '{first}'")

    _basic_checks(left_df, right_df, how, lsuffix, rsuffix)

    indices = _geom_predicate_query(left_df, right_df, predicate)

    joined = _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)

    return joined
|
|
|
|
|
|
def _basic_checks(left_df, right_df, how, lsuffix, rsuffix):
    """Validate the inputs of a spatial join.

    Both frames must be GeoDataFrames and `how` must be one of the supported
    join types. The names `'index_'` + `lsuffix` and `'index_'` + `rsuffix`
    must not already be used as column names in either frame. A CRS mismatch
    between the frames only produces a warning.

    Parameters
    ----------
    left_df : GeoDataFrame
    right_df : GeoDataFrame
    how : str, one of 'left', 'right', 'inner'
        join type
    lsuffix : str
        left index suffix
    rsuffix : str
        right index suffix

    Raises
    ------
    ValueError
        If a frame is not a GeoDataFrame, `how` is not a supported join type,
        or one of the reserved index column names is already in use.
    """
    for arg_name, frame in (("left_df", left_df), ("right_df", right_df)):
        if not isinstance(frame, GeoDataFrame):
            raise ValueError(
                f"'{arg_name}' should be GeoDataFrame, got {type(frame)}"
            )

    allowed_hows = ["left", "right", "inner"]
    if how not in allowed_hows:
        raise ValueError(
            f'`how` was "{how}" but is expected to be in {allowed_hows}'
        )

    crs_matches = _check_crs(left_df, right_df)
    if not crs_matches:
        _crs_mismatch_warn(left_df, right_df, stacklevel=4)

    index_left = f"index_{lsuffix}"
    index_right = f"index_{rsuffix}"
    reserved = [index_left, index_right]

    # due to GH 352
    name_clash = any(
        frame.columns.isin(reserved).any() for frame in (left_df, right_df)
    )
    if name_clash:
        raise ValueError(
            f"'{index_left}' and '{index_right}' cannot be names in the frames being"
            " joined"
        )
|
|
|
|
|
|
def _geom_predicate_query(left_df, right_df, predicate):
|
|
"""Compute geometric comparisons and get matching indices.
|
|
|
|
Parameters
|
|
----------
|
|
left_df : GeoDataFrame
|
|
right_df : GeoDataFrame
|
|
predicate : string
|
|
Binary predicate to query.
|
|
|
|
Returns
|
|
-------
|
|
DataFrame
|
|
DataFrame with matching indices in
|
|
columns named `_key_left` and `_key_right`.
|
|
"""
|
|
with warnings.catch_warnings():
|
|
# We don't need to show our own warning here
|
|
# TODO remove this once the deprecation has been enforced
|
|
warnings.filterwarnings(
|
|
"ignore", "Generated spatial index is empty", FutureWarning
|
|
)
|
|
|
|
original_predicate = predicate
|
|
|
|
if predicate == "within":
|
|
# within is implemented as the inverse of contains
|
|
# contains is a faster predicate
|
|
# see discussion at https://github.com/geopandas/geopandas/pull/1421
|
|
predicate = "contains"
|
|
sindex = left_df.sindex
|
|
input_geoms = right_df.geometry
|
|
else:
|
|
# all other predicates are symmetric
|
|
# keep them the same
|
|
sindex = right_df.sindex
|
|
input_geoms = left_df.geometry
|
|
|
|
if sindex:
|
|
l_idx, r_idx = sindex.query(input_geoms, predicate=predicate, sort=False)
|
|
indices = pd.DataFrame({"_key_left": l_idx, "_key_right": r_idx})
|
|
else:
|
|
# when sindex is empty / has no valid geometries
|
|
indices = pd.DataFrame(columns=["_key_left", "_key_right"], dtype=float)
|
|
|
|
if original_predicate == "within":
|
|
# within is implemented as the inverse of contains
|
|
# flip back the results
|
|
indices = indices.rename(
|
|
columns={"_key_left": "_key_right", "_key_right": "_key_left"}
|
|
)
|
|
|
|
return indices
|
|
|
|
|
|
def _frame_join(join_df, left_df, right_df, how, lsuffix, rsuffix):
|
|
"""Join the GeoDataFrames at the DataFrame level.
|
|
|
|
Parameters
|
|
----------
|
|
join_df : DataFrame
|
|
Indices and join data returned by the geometric join.
|
|
Must have columns `_key_left` and `_key_right`
|
|
with integer indices representing the matches
|
|
from `left_df` and `right_df` respectively.
|
|
Additional columns may be included and will be copied to
|
|
the resultant GeoDataFrame.
|
|
left_df : GeoDataFrame
|
|
right_df : GeoDataFrame
|
|
lsuffix : string
|
|
Suffix to apply to overlapping column names (left GeoDataFrame).
|
|
rsuffix : string
|
|
Suffix to apply to overlapping column names (right GeoDataFrame).
|
|
how : string
|
|
The type of join to use on the DataFrame level.
|
|
|
|
Returns
|
|
-------
|
|
GeoDataFrame
|
|
Joined GeoDataFrame.
|
|
"""
|
|
# the spatial index only allows limited (numeric) index types, but an
|
|
# index in geopandas may be any arbitrary dtype. so reset both indices now
|
|
# and store references to the original indices, to be reaffixed later.
|
|
# GH 352
|
|
index_left = "index_{}".format(lsuffix)
|
|
left_df = left_df.copy(deep=True)
|
|
try:
|
|
left_index_name = left_df.index.name
|
|
left_df.index = left_df.index.rename(index_left)
|
|
except TypeError:
|
|
index_left = [
|
|
"index_{}".format(lsuffix + str(pos))
|
|
for pos, ix in enumerate(left_df.index.names)
|
|
]
|
|
left_index_name = left_df.index.names
|
|
left_df.index = left_df.index.rename(index_left)
|
|
left_df = left_df.reset_index()
|
|
|
|
index_right = "index_{}".format(rsuffix)
|
|
right_df = right_df.copy(deep=True)
|
|
try:
|
|
right_index_name = right_df.index.name
|
|
right_df.index = right_df.index.rename(index_right)
|
|
except TypeError:
|
|
index_right = [
|
|
"index_{}".format(rsuffix + str(pos))
|
|
for pos, ix in enumerate(right_df.index.names)
|
|
]
|
|
right_index_name = right_df.index.names
|
|
right_df.index = right_df.index.rename(index_right)
|
|
right_df = right_df.reset_index()
|
|
|
|
# perform join on the dataframes
|
|
if how == "inner":
|
|
join_df = join_df.set_index("_key_left")
|
|
joined = (
|
|
left_df.merge(join_df, left_index=True, right_index=True)
|
|
.merge(
|
|
right_df.drop(right_df.geometry.name, axis=1),
|
|
left_on="_key_right",
|
|
right_index=True,
|
|
suffixes=("_{}".format(lsuffix), "_{}".format(rsuffix)),
|
|
)
|
|
.set_index(index_left)
|
|
.drop(["_key_right"], axis=1)
|
|
)
|
|
if isinstance(index_left, list):
|
|
joined.index.names = left_index_name
|
|
else:
|
|
joined.index.name = left_index_name
|
|
|
|
elif how == "left":
|
|
join_df = join_df.set_index("_key_left")
|
|
joined = (
|
|
left_df.merge(join_df, left_index=True, right_index=True, how="left")
|
|
.merge(
|
|
right_df.drop(right_df.geometry.name, axis=1),
|
|
how="left",
|
|
left_on="_key_right",
|
|
right_index=True,
|
|
suffixes=("_{}".format(lsuffix), "_{}".format(rsuffix)),
|
|
)
|
|
.set_index(index_left)
|
|
.drop(["_key_right"], axis=1)
|
|
)
|
|
if isinstance(index_left, list):
|
|
joined.index.names = left_index_name
|
|
else:
|
|
joined.index.name = left_index_name
|
|
|
|
else: # how == 'right':
|
|
joined = (
|
|
left_df.drop(left_df.geometry.name, axis=1)
|
|
.merge(
|
|
join_df.merge(
|
|
right_df, left_on="_key_right", right_index=True, how="right"
|
|
),
|
|
left_index=True,
|
|
right_on="_key_left",
|
|
how="right",
|
|
suffixes=("_{}".format(lsuffix), "_{}".format(rsuffix)),
|
|
)
|
|
.set_index(index_right)
|
|
.drop(["_key_left", "_key_right"], axis=1)
|
|
.set_geometry(right_df.geometry.name)
|
|
)
|
|
if isinstance(index_right, list):
|
|
joined.index.names = right_index_name
|
|
else:
|
|
joined.index.name = right_index_name
|
|
|
|
return joined
|
|
|
|
|
|
def _nearest_query(
    left_df: GeoDataFrame,
    right_df: GeoDataFrame,
    max_distance: float,
    how: str,
    return_distance: bool,
    exclusive: bool,
):
    """Find nearest-neighbour matches between two frames via the spatial index.

    Parameters
    ----------
    left_df, right_df : GeoDataFrame
    max_distance : float
        Forwarded to ``sindex.nearest`` as ``max_distance``.
    how : str
        Join type. For ``"right"`` the left frame's spatial index is queried
        with the right geometries; otherwise the right frame's index is
        queried with the left geometries.
    return_distance : bool
        Whether distances are requested from ``sindex.nearest``.
    exclusive : bool
        Forwarded to ``sindex.nearest`` as ``exclusive``.

    Returns
    -------
    DataFrame
        Columns `_key_left`, `_key_right` and `distances`. `distances` holds
        ``None`` when `return_distance` is false.

    Raises
    ------
    NotImplementedError
        If neither Shapely >= 2.0 nor PyGEOS >= 0.10.0 is in use.
    """
    nearest_supported = compat.USE_SHAPELY_20 or (
        compat.USE_PYGEOS and compat.PYGEOS_GE_010
    )
    if not nearest_supported:
        raise NotImplementedError(
            "Currently, only PyGEOS >= 0.10.0 or Shapely >= 2.0 supports "
            "`nearest_all`. " + compat.INSTALL_PYGEOS_ERROR
        )

    # use the opposite of the join direction for the index
    tree_on_left = how == "right"
    if tree_on_left:
        tree_frame, probe_frame = left_df, right_df
    else:
        tree_frame, probe_frame = right_df, left_df
    tree = tree_frame.sindex
    probes = probe_frame.geometry

    if not tree:
        # when sindex is empty / has no valid geometries
        return pd.DataFrame(
            columns=["_key_left", "_key_right", "distances"], dtype=float
        )

    result = tree.nearest(
        probes,
        return_all=True,
        max_distance=max_distance,
        return_distance=return_distance,
        exclusive=exclusive,
    )
    if return_distance:
        (probe_idx, tree_idx), distances = result
    else:
        probe_idx, tree_idx = result
        distances = None

    if tree_on_left:
        # tree entries are the left rows here; reorder the matches by left
        # position, using a stable sort so ties keep their query order
        order = np.argsort(tree_idx, kind="stable")
        l_idx = tree_idx[order]
        r_idx = probe_idx[order]
        if distances is not None:
            distances = distances[order]
    else:
        l_idx, r_idx = probe_idx, tree_idx

    return pd.DataFrame(
        {"_key_left": l_idx, "_key_right": r_idx, "distances": distances}
    )
|
|
|
|
|
|
def sjoin_nearest(
    left_df: GeoDataFrame,
    right_df: GeoDataFrame,
    how: str = "inner",
    max_distance: Optional[float] = None,
    lsuffix: str = "left",
    rsuffix: str = "right",
    distance_col: Optional[str] = None,
    exclusive: bool = False,
) -> GeoDataFrame:
    """Spatial join of two GeoDataFrames based on the distance between their geometries.

    Results will include multiple output records for a single input record
    where there are multiple equidistant nearest or intersected neighbors.

    Distance is calculated in CRS units and can be returned using the
    `distance_col` parameter.

    See the User Guide page
    https://geopandas.readthedocs.io/en/latest/docs/user_guide/mergingdata.html
    for more details.


    Parameters
    ----------
    left_df, right_df : GeoDataFrames
    how : string, default 'inner'
        The type of join:

        * 'left': use keys from left_df; retain only left_df geometry column
        * 'right': use keys from right_df; retain only right_df geometry column
        * 'inner': use intersection of keys from both dfs; retain only
          left_df geometry column
    max_distance : float, default None
        Maximum distance within which to query for nearest geometry.
        Must be greater than 0.
        The max_distance used to search for nearest items in the tree may have a
        significant impact on performance by reducing the number of input
        geometries that are evaluated for nearest items in the tree.
    lsuffix : string, default 'left'
        Suffix to apply to overlapping column names (left GeoDataFrame).
    rsuffix : string, default 'right'
        Suffix to apply to overlapping column names (right GeoDataFrame).
    distance_col : string, default None
        If set, save the distances computed between matching geometries under a
        column of this name in the joined GeoDataFrame. The column is placed
        last.
    exclusive : bool, default False
        If True, the nearest geometries that are equal to the input geometry
        will not be returned, default False.
        Requires Shapely >= 2.0.

    Examples
    --------
    >>> import geodatasets
    >>> groceries = geopandas.read_file(
    ...     geodatasets.get_path("geoda.groceries")
    ... )
    >>> chicago = geopandas.read_file(
    ...     geodatasets.get_path("geoda.chicago_health")
    ... ).to_crs(groceries.crs)

    >>> chicago.head()  # doctest: +SKIP
       ComAreaID  ...                                           geometry
    0         35  ...  POLYGON ((-87.60914 41.84469, -87.60915 41.844...
    1         36  ...  POLYGON ((-87.59215 41.81693, -87.59231 41.816...
    2         37  ...  POLYGON ((-87.62880 41.80189, -87.62879 41.801...
    3         38  ...  POLYGON ((-87.60671 41.81681, -87.60670 41.816...
    4         39  ...  POLYGON ((-87.59215 41.81693, -87.59215 41.816...
    [5 rows x 87 columns]

    >>> groceries.head()  # doctest: +SKIP
       OBJECTID     Ycoord  ...  Category                         geometry
    0        16  41.973266  ...       NaN  MULTIPOINT (-87.65661 41.97321)
    1        18  41.696367  ...       NaN  MULTIPOINT (-87.68136 41.69713)
    2        22  41.868634  ...       NaN  MULTIPOINT (-87.63918 41.86847)
    3        23  41.877590  ...       new  MULTIPOINT (-87.65495 41.87783)
    4        27  41.737696  ...       NaN  MULTIPOINT (-87.62715 41.73623)
    [5 rows x 8 columns]

    >>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago)
    >>> groceries_w_communities[["Chain", "community", "geometry"]].head(2)
                 Chain community                              geometry
    0   VIET HOA PLAZA    UPTOWN  MULTIPOINT (1168268.672 1933554.350)
    87      JEWEL OSCO    UPTOWN  MULTIPOINT (1168837.980 1929246.962)


    To include the distances:

    >>> groceries_w_communities = geopandas.sjoin_nearest(
    ...     groceries, chicago, distance_col="distances"
    ... )
    >>> groceries_w_communities[
    ...     ["Chain", "community", "distances"]
    ... ].head(2)  # doctest: +SKIP
                 Chain community  distances
    0   VIET HOA PLAZA    UPTOWN        0.0
    87      JEWEL OSCO    UPTOWN        0.0

    In the following example, we get multiple groceries for Uptown because all
    results are equidistant (in this case zero because they intersect).
    In fact, we get 4 results in total:

    >>> chicago_w_groceries = geopandas.sjoin_nearest(
    ...     groceries, chicago, distance_col="distances", how="right"
    ... )
    >>> uptown_results = chicago_w_groceries[
    ...     chicago_w_groceries["community"] == "UPTOWN"
    ... ]
    >>> uptown_results[["Chain", "community"]]  # doctest: +SKIP
                 Chain community
    30  VIET HOA PLAZA    UPTOWN
    30      JEWEL OSCO    UPTOWN
    30          TARGET    UPTOWN
    30       Mariano's    UPTOWN

    See also
    --------
    sjoin : binary predicate joins
    GeoDataFrame.sjoin_nearest : equivalent method

    Notes
    -----
    Since this join relies on distances, results will be inaccurate
    if your geometries are in a geographic CRS.

    Every operation in GeoPandas is planar, i.e. the potential third
    dimension is not taken into account.
    """
    _basic_checks(left_df, right_df, how, lsuffix, rsuffix)

    # distances are computed in CRS units, so run the geographic-CRS check
    # on both inputs
    for frame in (left_df, right_df):
        frame.geometry.values.check_geographic_crs(stacklevel=1)

    want_distance = distance_col is not None

    matches = _nearest_query(
        left_df, right_df, max_distance, how, want_distance, exclusive
    )

    if not want_distance:
        # the placeholder distance column must not leak into the result
        matches.pop("distances")
        return _frame_join(matches, left_df, right_df, how, lsuffix, rsuffix)

    matches = matches.rename(columns={"distances": distance_col})
    joined = _frame_join(matches, left_df, right_df, how, lsuffix, rsuffix)

    # move the distance column to the end of the frame
    other_columns = [name for name in joined.columns if name != distance_col]
    return joined[other_columns + [distance_col]]
|