1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536
|
from typing import Optional
import warnings
import numpy as np
import pandas as pd
from geopandas import GeoDataFrame
from geopandas import _compat as compat
from geopandas.array import _check_crs, _crs_mismatch_warn
def sjoin(
left_df,
right_df,
how="inner",
predicate="intersects",
lsuffix="left",
rsuffix="right",
**kwargs,
):
"""Spatial join of two GeoDataFrames.
See the User Guide page :doc:`../../user_guide/mergingdata` for details.
Parameters
----------
left_df, right_df : GeoDataFrames
how : string, default 'inner'
The type of join:
* 'left': use keys from left_df; retain only left_df geometry column
* 'right': use keys from right_df; retain only right_df geometry column
* 'inner': use intersection of keys from both dfs; retain only
left_df geometry column
predicate : string, default 'intersects'
Binary predicate. Valid values are determined by the spatial index used.
You can check the valid values in left_df or right_df as
``left_df.sindex.valid_query_predicates`` or
``right_df.sindex.valid_query_predicates``
Replaces deprecated ``op`` parameter.
lsuffix : string, default 'left'
Suffix to apply to overlapping column names (left GeoDataFrame).
rsuffix : string, default 'right'
Suffix to apply to overlapping column names (right GeoDataFrame).
Examples
--------
>>> countries = geopandas.read_file(geopandas.datasets.get_\
path("naturalearth_lowres"))
>>> cities = geopandas.read_file(geopandas.datasets.get_path("naturalearth_cities"))
>>> countries.head() # doctest: +SKIP
pop_est continent name \
iso_a3 gdp_md_est geometry
0 920938 Oceania Fiji FJI 8374.0 MULTIPOLY\
GON (((180.00000 -16.06713, 180.00000...
1 53950935 Africa Tanzania TZA 150600.0 POLYGON (\
(33.90371 -0.95000, 34.07262 -1.05982...
2 603253 Africa W. Sahara ESH 906.5 POLYGON (\
(-8.66559 27.65643, -8.66512 27.58948...
3 35623680 North America Canada CAN 1674000.0 MULTIPOLY\
GON (((-122.84000 49.00000, -122.9742...
4 326625791 North America United States of America USA 18560000.0 MULTIPOLY\
GON (((-122.84000 49.00000, -120.0000...
>>> cities.head()
name geometry
0 Vatican City POINT (12.45339 41.90328)
1 San Marino POINT (12.44177 43.93610)
2 Vaduz POINT (9.51667 47.13372)
3 Lobamba POINT (31.20000 -26.46667)
4 Luxembourg POINT (6.13000 49.61166)
>>> cities_w_country_data = geopandas.sjoin(cities, countries)
>>> cities_w_country_data.head() # doctest: +SKIP
name_left geometry index_right pop_est continent name_\
right iso_a3 gdp_md_est
0 Vatican City POINT (12.45339 41.90328) 141 62137802 Europe \
Italy ITA 2221000.0
1 San Marino POINT (12.44177 43.93610) 141 62137802 Europe \
Italy ITA 2221000.0
192 Rome POINT (12.48131 41.89790) 141 62137802 Europe \
Italy ITA 2221000.0
2 Vaduz POINT (9.51667 47.13372) 114 8754413 Europe Au\
stria AUT 416600.0
184 Vienna POINT (16.36469 48.20196) 114 8754413 Europe Au\
stria AUT 416600.0
See also
--------
overlay : overlay operation resulting in a new geometry
GeoDataFrame.sjoin : equivalent method
Notes
------
Every operation in GeoPandas is planar, i.e. the potential third
dimension is not taken into account.
"""
if "op" in kwargs:
op = kwargs.pop("op")
deprecation_message = (
"The `op` parameter is deprecated and will be removed"
" in a future release. Please use the `predicate` parameter"
" instead."
)
if predicate != "intersects" and op != predicate:
override_message = (
"A non-default value for `predicate` was passed"
f' (got `predicate="{predicate}"`'
f' in combination with `op="{op}"`).'
" The value of `predicate` will be overridden by the value of `op`,"
" , which may result in unexpected behavior."
f"\n{deprecation_message}"
)
warnings.warn(override_message, UserWarning, stacklevel=4)
else:
warnings.warn(deprecation_message, FutureWarning, stacklevel=4)
predicate = op
if kwargs:
first = next(iter(kwargs.keys()))
raise TypeError(f"sjoin() got an unexpected keyword argument '{first}'")
_basic_checks(left_df, right_df, how, lsuffix, rsuffix)
indices = _geom_predicate_query(left_df, right_df, predicate)
joined = _frame_join(indices, left_df, right_df, how, lsuffix, rsuffix)
return joined
def _basic_checks(left_df, right_df, how, lsuffix, rsuffix):
"""Checks the validity of join input parameters.
`how` must be one of the valid options.
`'index_'` concatenated with `lsuffix` or `rsuffix` must not already
exist as columns in the left or right data frames.
Parameters
------------
left_df : GeoDataFrame
right_df : GeoData Frame
how : str, one of 'left', 'right', 'inner'
join type
lsuffix : str
left index suffix
rsuffix : str
right index suffix
"""
if not isinstance(left_df, GeoDataFrame):
raise ValueError(
"'left_df' should be GeoDataFrame, got {}".format(type(left_df))
)
if not isinstance(right_df, GeoDataFrame):
raise ValueError(
"'right_df' should be GeoDataFrame, got {}".format(type(right_df))
)
allowed_hows = ["left", "right", "inner"]
if how not in allowed_hows:
raise ValueError(
'`how` was "{}" but is expected to be in {}'.format(how, allowed_hows)
)
if not _check_crs(left_df, right_df):
_crs_mismatch_warn(left_df, right_df, stacklevel=4)
index_left = "index_{}".format(lsuffix)
index_right = "index_{}".format(rsuffix)
# due to GH 352
if any(left_df.columns.isin([index_left, index_right])) or any(
right_df.columns.isin([index_left, index_right])
):
raise ValueError(
"'{0}' and '{1}' cannot be names in the frames being"
" joined".format(index_left, index_right)
)
def _geom_predicate_query(left_df, right_df, predicate):
"""Compute geometric comparisons and get matching indices.
Parameters
----------
left_df : GeoDataFrame
right_df : GeoDataFrame
predicate : string
Binary predicate to query.
Returns
-------
DataFrame
DataFrame with matching indices in
columns named `_key_left` and `_key_right`.
"""
with warnings.catch_warnings():
# We don't need to show our own warning here
# TODO remove this once the deprecation has been enforced
warnings.filterwarnings(
"ignore", "Generated spatial index is empty", FutureWarning
)
original_predicate = predicate
if predicate == "within":
# within is implemented as the inverse of contains
# contains is a faster predicate
# see discussion at https://github.com/geopandas/geopandas/pull/1421
predicate = "contains"
sindex = left_df.sindex
input_geoms = right_df.geometry
else:
# all other predicates are symmetric
# keep them the same
sindex = right_df.sindex
input_geoms = left_df.geometry
if sindex:
l_idx, r_idx = sindex.query_bulk(input_geoms, predicate=predicate, sort=False)
indices = pd.DataFrame({"_key_left": l_idx, "_key_right": r_idx})
else:
# when sindex is empty / has no valid geometries
indices = pd.DataFrame(columns=["_key_left", "_key_right"], dtype=float)
if original_predicate == "within":
# within is implemented as the inverse of contains
# flip back the results
indices = indices.rename(
columns={"_key_left": "_key_right", "_key_right": "_key_left"}
)
return indices
def _frame_join(join_df, left_df, right_df, how, lsuffix, rsuffix):
"""Join the GeoDataFrames at the DataFrame level.
Parameters
----------
join_df : DataFrame
Indices and join data returned by the geometric join.
Must have columns `_key_left` and `_key_right`
with integer indices representing the matches
from `left_df` and `right_df` respectively.
Additional columns may be included and will be copied to
the resultant GeoDataFrame.
left_df : GeoDataFrame
right_df : GeoDataFrame
lsuffix : string
Suffix to apply to overlapping column names (left GeoDataFrame).
rsuffix : string
Suffix to apply to overlapping column names (right GeoDataFrame).
how : string
The type of join to use on the DataFrame level.
Returns
-------
GeoDataFrame
Joined GeoDataFrame.
"""
# the spatial index only allows limited (numeric) index types, but an
# index in geopandas may be any arbitrary dtype. so reset both indices now
# and store references to the original indices, to be reaffixed later.
# GH 352
index_left = "index_{}".format(lsuffix)
left_df = left_df.copy(deep=True)
try:
left_index_name = left_df.index.name
left_df.index = left_df.index.rename(index_left)
except TypeError:
index_left = [
"index_{}".format(lsuffix + str(pos))
for pos, ix in enumerate(left_df.index.names)
]
left_index_name = left_df.index.names
left_df.index = left_df.index.rename(index_left)
left_df = left_df.reset_index()
index_right = "index_{}".format(rsuffix)
right_df = right_df.copy(deep=True)
try:
right_index_name = right_df.index.name
right_df.index = right_df.index.rename(index_right)
except TypeError:
index_right = [
"index_{}".format(rsuffix + str(pos))
for pos, ix in enumerate(right_df.index.names)
]
right_index_name = right_df.index.names
right_df.index = right_df.index.rename(index_right)
right_df = right_df.reset_index()
# perform join on the dataframes
if how == "inner":
join_df = join_df.set_index("_key_left")
joined = (
left_df.merge(join_df, left_index=True, right_index=True)
.merge(
right_df.drop(right_df.geometry.name, axis=1),
left_on="_key_right",
right_index=True,
suffixes=("_{}".format(lsuffix), "_{}".format(rsuffix)),
)
.set_index(index_left)
.drop(["_key_right"], axis=1)
)
if isinstance(index_left, list):
joined.index.names = left_index_name
else:
joined.index.name = left_index_name
elif how == "left":
join_df = join_df.set_index("_key_left")
joined = (
left_df.merge(join_df, left_index=True, right_index=True, how="left")
.merge(
right_df.drop(right_df.geometry.name, axis=1),
how="left",
left_on="_key_right",
right_index=True,
suffixes=("_{}".format(lsuffix), "_{}".format(rsuffix)),
)
.set_index(index_left)
.drop(["_key_right"], axis=1)
)
if isinstance(index_left, list):
joined.index.names = left_index_name
else:
joined.index.name = left_index_name
else: # how == 'right':
joined = (
left_df.drop(left_df.geometry.name, axis=1)
.merge(
join_df.merge(
right_df, left_on="_key_right", right_index=True, how="right"
),
left_index=True,
right_on="_key_left",
how="right",
suffixes=("_{}".format(lsuffix), "_{}".format(rsuffix)),
)
.set_index(index_right)
.drop(["_key_left", "_key_right"], axis=1)
.set_geometry(right_df.geometry.name)
)
if isinstance(index_right, list):
joined.index.names = right_index_name
else:
joined.index.name = right_index_name
return joined
def _nearest_query(
left_df: GeoDataFrame,
right_df: GeoDataFrame,
max_distance: float,
how: str,
return_distance: bool,
):
if not (compat.USE_SHAPELY_20 or (compat.USE_PYGEOS and compat.PYGEOS_GE_010)):
raise NotImplementedError(
"Currently, only PyGEOS >= 0.10.0 or Shapely >= 2.0 supports "
"`nearest_all`. " + compat.INSTALL_PYGEOS_ERROR
)
# use the opposite of the join direction for the index
use_left_as_sindex = how == "right"
if use_left_as_sindex:
sindex = left_df.sindex
query = right_df.geometry
else:
sindex = right_df.sindex
query = left_df.geometry
if sindex:
res = sindex.nearest(
query,
return_all=True,
max_distance=max_distance,
return_distance=return_distance,
)
if return_distance:
(input_idx, tree_idx), distances = res
else:
(input_idx, tree_idx) = res
distances = None
if use_left_as_sindex:
l_idx, r_idx = tree_idx, input_idx
sort_order = np.argsort(l_idx, kind="stable")
l_idx, r_idx = l_idx[sort_order], r_idx[sort_order]
if distances is not None:
distances = distances[sort_order]
else:
l_idx, r_idx = input_idx, tree_idx
join_df = pd.DataFrame(
{"_key_left": l_idx, "_key_right": r_idx, "distances": distances}
)
else:
# when sindex is empty / has no valid geometries
join_df = pd.DataFrame(
columns=["_key_left", "_key_right", "distances"], dtype=float
)
return join_df
def sjoin_nearest(
left_df: GeoDataFrame,
right_df: GeoDataFrame,
how: str = "inner",
max_distance: Optional[float] = None,
lsuffix: str = "left",
rsuffix: str = "right",
distance_col: Optional[str] = None,
) -> GeoDataFrame:
"""Spatial join of two GeoDataFrames based on the distance between their geometries.
Results will include multiple output records for a single input record
where there are multiple equidistant nearest or intersected neighbors.
Distance is calculated in CRS units and can be returned using the
`distance_col` parameter.
See the User Guide page
https://geopandas.readthedocs.io/en/latest/docs/user_guide/mergingdata.html
for more details.
Parameters
----------
left_df, right_df : GeoDataFrames
how : string, default 'inner'
The type of join:
* 'left': use keys from left_df; retain only left_df geometry column
* 'right': use keys from right_df; retain only right_df geometry column
* 'inner': use intersection of keys from both dfs; retain only
left_df geometry column
max_distance : float, default None
Maximum distance within which to query for nearest geometry.
Must be greater than 0.
The max_distance used to search for nearest items in the tree may have a
significant impact on performance by reducing the number of input
geometries that are evaluated for nearest items in the tree.
lsuffix : string, default 'left'
Suffix to apply to overlapping column names (left GeoDataFrame).
rsuffix : string, default 'right'
Suffix to apply to overlapping column names (right GeoDataFrame).
distance_col : string, default None
If set, save the distances computed between matching geometries under a
column of this name in the joined GeoDataFrame.
Examples
--------
>>> countries = geopandas.read_file(geopandas.datasets.get_\
path("naturalearth_lowres"))
>>> cities = geopandas.read_file(geopandas.datasets.get_path("naturalearth_cities"))
>>> countries.head(2).name # doctest: +SKIP
pop_est continent name \
iso_a3 gdp_md_est geometry
0 920938 Oceania Fiji FJI 8374.0 MULTIPOLY\
GON (((180.00000 -16.06713, 180.00000...
1 53950935 Africa Tanzania TZA 150600.0 POLYGON (\
(33.90371 -0.95000, 34.07262 -1.05982...
>>> cities.head(2).name # doctest: +SKIP
name geometry
0 Vatican City POINT (12.45339 41.90328)
1 San Marino POINT (12.44177 43.93610)
>>> cities_w_country_data = geopandas.sjoin_nearest(cities, countries)
>>> cities_w_country_data[['name_left', 'name_right']].head(2) # doctest: +SKIP
name_left geometry index_right pop_est continent name_\
right iso_a3 gdp_md_est
0 Vatican City POINT (12.45339 41.90328) 141 62137802 Europe \
Italy ITA 2221000.0
1 San Marino POINT (12.44177 43.93610) 141 62137802 Europe \
Italy ITA 2221000.0
To include the distances:
>>> cities_w_country_data = geopandas.sjoin_nearest\
(cities, countries, distance_col="distances")
>>> cities_w_country_data[["name_left", "name_right", \
"distances"]].head(2) # doctest: +SKIP
name_left name_right distances
0 Vatican City Italy 0.0
1 San Marino Italy 0.0
In the following example, we get multiple cities for Italy because all results are
equidistant (in this case zero because they intersect).
In fact, we get 3 results in total:
>>> countries_w_city_data = geopandas.sjoin_nearest\
(cities, countries, distance_col="distances", how="right")
>>> italy_results = \
countries_w_city_data[countries_w_city_data["name_left"] == "Italy"]
>>> italy_results # doctest: +SKIP
name_x name_y
141 Vatican City Italy
141 San Marino Italy
141 Rome Italy
See also
--------
sjoin : binary predicate joins
GeoDataFrame.sjoin_nearest : equivalent method
Notes
-----
Since this join relies on distances, results will be inaccurate
if your geometries are in a geographic CRS.
Every operation in GeoPandas is planar, i.e. the potential third
dimension is not taken into account.
"""
_basic_checks(left_df, right_df, how, lsuffix, rsuffix)
left_df.geometry.values.check_geographic_crs(stacklevel=1)
right_df.geometry.values.check_geographic_crs(stacklevel=1)
return_distance = distance_col is not None
join_df = _nearest_query(left_df, right_df, max_distance, how, return_distance)
if return_distance:
join_df = join_df.rename(columns={"distances": distance_col})
else:
join_df.pop("distances")
joined = _frame_join(join_df, left_df, right_df, how, lsuffix, rsuffix)
if return_distance:
columns = [c for c in joined.columns if c != distance_col] + [distance_col]
joined = joined[columns]
return joined
|