refactor: excel parse
@@ -0,0 +1,15 @@
from .clip import clip
from .geocoding import geocode, reverse_geocode
from .overlay import overlay
from .sjoin import sjoin, sjoin_nearest
from .util import collect

__all__ = [
    "collect",
    "geocode",
    "overlay",
    "reverse_geocode",
    "sjoin",
    "sjoin_nearest",
    "clip",
]
@@ -0,0 +1,84 @@
from warnings import warn

import numpy

from shapely.geometry import MultiPoint

from geopandas.array import from_shapely, points_from_xy
from geopandas.geoseries import GeoSeries


def uniform(geom, size, rng=None):
    """
    Sample uniformly at random from a geometry.

    For polygons, this samples uniformly within the area of the polygon. For lines,
    this samples uniformly along the length of the linestring. For multi-part
    geometries, the weights of each part are selected according to their relevant
    attribute (area for Polygons, length for LineStrings), and then points are
    sampled from each part uniformly.

    Any other geometry type (e.g. Point, GeometryCollection) is ignored, and an
    empty MultiPoint geometry is returned.

    Parameters
    ----------
    geom : any shapely.geometry.BaseGeometry type
        the shape that describes the area in which to sample.
    size : integer
        an integer denoting how many points to sample
    rng : int or numpy.random.Generator, optional
        seed (or generator) passed to ``numpy.random.default_rng`` for
        reproducible sampling

    Returns
    -------
    shapely.MultiPoint geometry containing the sampled points

    Examples
    --------
    >>> from shapely.geometry import box
    >>> square = box(0, 0, 1, 1)
    >>> uniform(square, size=102)  # doctest: +SKIP
    """
    generator = numpy.random.default_rng(seed=rng)

    if geom is None or geom.is_empty:
        return MultiPoint()

    if geom.geom_type in ("Polygon", "MultiPolygon"):
        return _uniform_polygon(geom, size=size, generator=generator)

    if geom.geom_type in ("LineString", "MultiLineString"):
        return _uniform_line(geom, size=size, generator=generator)

    warn(
        f"Sampling is not supported for {geom.geom_type} geometry type.",
        UserWarning,
        stacklevel=8,
    )
    return MultiPoint()


def _uniform_line(geom, size, generator):
    """
    Sample points from an input shapely linestring.
    """
    fracs = generator.uniform(size=size)
    return from_shapely(geom.interpolate(fracs, normalized=True)).union_all()


def _uniform_polygon(geom, size, generator):
    """
    Sample uniformly from within a polygon using batched rejection sampling.
    """
    xmin, ymin, xmax, ymax = geom.bounds
    candidates = []
    while len(candidates) < size:
        batch = points_from_xy(
            x=generator.uniform(xmin, xmax, size=size),
            y=generator.uniform(ymin, ymax, size=size),
        )
        valid_samples = batch[batch.sindex.query(geom, predicate="contains")]
        candidates.extend(valid_samples)
    return GeoSeries(candidates[:size]).union_all()
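
# Added usage sketch (not part of the commit): drawing a reproducible sample
# from a unit square with uniform() above.
#   from shapely.geometry import box
#   pts = uniform(box(0, 0, 1, 1), size=10, rng=42)
#   len(pts.geoms)  # 10 points, uniformly distributed over the square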
@@ -0,0 +1,169 @@
import importlib
import platform
import sys


def _get_sys_info():
    """System information

    Returns
    -------
    sys_info : dict
        system and Python version information
    """
    python = sys.version.replace("\n", " ")

    blob = [
        ("python", python),
        ("executable", sys.executable),
        ("machine", platform.platform()),
    ]

    return dict(blob)


def _get_C_info():
    """Information on system PROJ, GDAL, GEOS

    Returns
    -------
    c_info: dict
        system PROJ, GDAL and GEOS information
    """
    try:
        import pyproj

        proj_version = pyproj.proj_version_str
    except Exception:
        proj_version = None
    try:
        import pyproj

        proj_dir = pyproj.datadir.get_data_dir()
    except Exception:
        proj_dir = None

    try:
        import shapely._buildcfg

        geos_version = "{}.{}.{}".format(*shapely._buildcfg.geos_version)
        geos_dir = shapely._buildcfg.geos_library_path
    except Exception:
        try:
            from shapely import geos_version_string

            geos_version = geos_version_string
            geos_dir = None
        except Exception:
            geos_version = None
            geos_dir = None

    try:
        import pyogrio

        gdal_version = pyogrio.__gdal_version_string__
        gdal_dir = pyogrio.get_gdal_data_path()
    except Exception:
        gdal_version = None
        gdal_dir = None

    if gdal_version is None:
        try:
            import fiona

            gdal_version = fiona.env.get_gdal_release_name()
        except Exception:
            gdal_version = None
        try:
            import fiona

            gdal_dir = fiona.env.GDALDataFinder().search()
        except Exception:
            gdal_dir = None

    blob = [
        ("GEOS", geos_version),
        ("GEOS lib", geos_dir),
        ("GDAL", gdal_version),
        ("GDAL data dir", gdal_dir),
        ("PROJ", proj_version),
        ("PROJ data dir", proj_dir),
    ]

    return dict(blob)


def _get_deps_info():
    """Overview of the installed version of main dependencies

    Returns
    -------
    deps_info: dict
        version information on relevant Python libraries
    """
    deps = [
        "geopandas",
        # required deps
        "numpy",
        "pandas",
        "pyproj",
        "shapely",
        # optional deps
        "pyogrio",
        "geoalchemy2",
        "geopy",
        "matplotlib",
        "mapclassify",
        "fiona",
        "psycopg",
        "psycopg2",
        "pyarrow",
    ]

    def get_version(module):
        return module.__version__

    deps_info = {}

    for modname in deps:
        try:
            if modname in sys.modules:
                mod = sys.modules[modname]
            else:
                mod = importlib.import_module(modname)
            ver = get_version(mod)
            deps_info[modname] = ver
        except Exception:
            deps_info[modname] = None

    return deps_info


def show_versions():
    """
    Print system information and installed module versions.

    Examples
    --------

    ::

        $ python -c "import geopandas; geopandas.show_versions()"
    """
    sys_info = _get_sys_info()
    deps_info = _get_deps_info()
    proj_info = _get_C_info()

    maxlen = max(len(x) for x in deps_info)
    tpl = "{{k:<{maxlen}}}: {{stat}}".format(maxlen=maxlen)
    print("\nSYSTEM INFO")
    print("-----------")
    for k, stat in sys_info.items():
        print(tpl.format(k=k, stat=stat))
    print("\nGEOS, GDAL, PROJ INFO")
    print("---------------------")
    for k, stat in proj_info.items():
        print(tpl.format(k=k, stat=stat))
    print("\nPYTHON DEPENDENCIES")
    print("-------------------")
    for k, stat in deps_info.items():
        print(tpl.format(k=k, stat=stat))
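
# Added note (illustrative, not part of the commit): tpl left-aligns keys to the
# longest dependency name so the printed values line up, e.g. with maxlen=6:
#   "{k:<6}: {stat}".format(k="numpy", stat="1.26.4")  # -> "numpy : 1.26.4"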
@@ -0,0 +1,257 @@
"""
geopandas.clip
==============

A module to clip vector data using GeoPandas.

"""

import warnings

import numpy as np
import pandas.api.types

from shapely.geometry import MultiPolygon, Polygon, box

from geopandas import GeoDataFrame, GeoSeries
from geopandas.array import _check_crs, _crs_mismatch_warn


def _mask_is_list_like_rectangle(mask):
    return pandas.api.types.is_list_like(mask) and not isinstance(
        mask, (GeoDataFrame, GeoSeries, Polygon, MultiPolygon)
    )


def _clip_gdf_with_mask(gdf, mask, sort=False):
    """Clip geometry to the polygon/rectangle extent.

    Clip an input GeoDataFrame to the polygon extent of the polygon
    parameter.

    Parameters
    ----------
    gdf : GeoDataFrame, GeoSeries
        Dataframe to clip.
    mask : (Multi)Polygon, list-like
        Reference polygon/rectangle for clipping.
    sort : boolean, default False
        If True, the results will be sorted in ascending order using the
        geometries' indexes as the primary key.

    Returns
    -------
    GeoDataFrame
        The returned GeoDataFrame is a clipped subset of gdf
        that intersects with polygon/rectangle.
    """
    clipping_by_rectangle = _mask_is_list_like_rectangle(mask)
    if clipping_by_rectangle:
        intersection_polygon = box(*mask)
    else:
        intersection_polygon = mask

    gdf_sub = gdf.iloc[
        gdf.sindex.query(intersection_polygon, predicate="intersects", sort=sort)
    ]

    # For performance reasons points don't need to be intersected with poly
    non_point_mask = gdf_sub.geom_type != "Point"

    if not non_point_mask.any():
        # only points, directly return
        return gdf_sub

    # Clip the data with the polygon
    if isinstance(gdf_sub, GeoDataFrame):
        clipped = gdf_sub.copy()
        if clipping_by_rectangle:
            clipped.loc[non_point_mask, clipped._geometry_column_name] = (
                gdf_sub.geometry.values[non_point_mask].clip_by_rect(*mask)
            )
        else:
            clipped.loc[non_point_mask, clipped._geometry_column_name] = (
                gdf_sub.geometry.values[non_point_mask].intersection(mask)
            )
    else:
        # GeoSeries
        clipped = gdf_sub.copy()
        if clipping_by_rectangle:
            clipped[non_point_mask] = gdf_sub.values[non_point_mask].clip_by_rect(*mask)
        else:
            clipped[non_point_mask] = gdf_sub.values[non_point_mask].intersection(mask)

    if clipping_by_rectangle:
        # clip_by_rect might return empty geometry collections in edge cases
        clipped = clipped[~clipped.is_empty]
    return clipped


def clip(gdf, mask, keep_geom_type=False, sort=False):
    """Clip points, lines, or polygon geometries to the mask extent.

    Both layers must be in the same Coordinate Reference System (CRS).
    The ``gdf`` will be clipped to the full extent of the clip object.

    If there are multiple polygons in mask, data from ``gdf`` will be
    clipped to the total boundary of all polygons in mask.

    If the ``mask`` is list-like with four elements ``(minx, miny, maxx, maxy)``, a
    faster rectangle clipping algorithm will be used. Note that this can lead to
    slightly different results in edge cases, e.g. if a line would be reduced to a
    point, this point might not be returned.

    The geometry is clipped in a fast but possibly dirty way. The output is not
    guaranteed to be valid. No exceptions will be raised for topological errors.

    Parameters
    ----------
    gdf : GeoDataFrame or GeoSeries
        Vector layer (point, line, polygon) to be clipped to mask.
    mask : GeoDataFrame, GeoSeries, (Multi)Polygon, list-like
        Polygon vector layer used to clip ``gdf``.
        The mask's geometry is dissolved into one geometric feature
        and intersected with ``gdf``.
        If the mask is list-like with four elements ``(minx, miny, maxx, maxy)``,
        ``clip`` will use a faster rectangle clipping
        (:meth:`~GeoSeries.clip_by_rect`), possibly leading to slightly
        different results.
    keep_geom_type : boolean, default False
        If True, return only geometries of original type in case of intersection
        resulting in multiple geometry types or GeometryCollections.
        If False, return all resulting geometries (potentially mixed types).
    sort : boolean, default False
        If True, the results will be sorted in ascending order using the
        geometries' indexes as the primary key.

    Returns
    -------
    GeoDataFrame or GeoSeries
        Vector data (points, lines, polygons) from ``gdf`` clipped to
        polygon boundary from mask.

    See also
    --------
    GeoDataFrame.clip : equivalent GeoDataFrame method
    GeoSeries.clip : equivalent GeoSeries method

    Examples
    --------
    Clip points (grocery stores) with polygons (the Near West Side community):

    >>> import geodatasets
    >>> chicago = geopandas.read_file(
    ...     geodatasets.get_path("geoda.chicago_health")
    ... )
    >>> near_west_side = chicago[chicago["community"] == "NEAR WEST SIDE"]
    >>> groceries = geopandas.read_file(
    ...     geodatasets.get_path("geoda.groceries")
    ... ).to_crs(chicago.crs)
    >>> groceries.shape
    (148, 8)

    >>> nws_groceries = geopandas.clip(groceries, near_west_side)
    >>> nws_groceries.shape
    (7, 8)
    """
    if not isinstance(gdf, (GeoDataFrame, GeoSeries)):
        raise TypeError(
            "'gdf' should be GeoDataFrame or GeoSeries, got {}".format(type(gdf))
        )

    mask_is_list_like = _mask_is_list_like_rectangle(mask)
    if (
        not isinstance(mask, (GeoDataFrame, GeoSeries, Polygon, MultiPolygon))
        and not mask_is_list_like
    ):
        raise TypeError(
            "'mask' should be GeoDataFrame, GeoSeries, "
            f"(Multi)Polygon or list-like, got {type(mask)}"
        )

    if mask_is_list_like and len(mask) != 4:
        raise TypeError(
            "If 'mask' is list-like, it must have four values (minx, miny, maxx, maxy)"
        )

    if isinstance(mask, (GeoDataFrame, GeoSeries)):
        if not _check_crs(gdf, mask):
            _crs_mismatch_warn(gdf, mask, stacklevel=3)

    if isinstance(mask, (GeoDataFrame, GeoSeries)):
        box_mask = mask.total_bounds
    elif mask_is_list_like:
        box_mask = mask
    else:
        # Avoid empty tuple returned by .bounds when geometry is empty. A tuple of
        # all nan values is consistent with the behavior of
        # {GeoSeries, GeoDataFrame}.total_bounds for empty geometries.
        # TODO(shapely) can simply use mask.bounds once relying on Shapely 2.0
        box_mask = mask.bounds if not mask.is_empty else (np.nan,) * 4
    box_gdf = gdf.total_bounds
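    # Added note: two axis-aligned boxes overlap iff each box's min is <= the
    # other's max on both axes; when the bounds are disjoint the clip is
    # necessarily empty, so an empty slice is returned without any geometry work.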
    if not (
        ((box_mask[0] <= box_gdf[2]) and (box_gdf[0] <= box_mask[2]))
        and ((box_mask[1] <= box_gdf[3]) and (box_gdf[1] <= box_mask[3]))
    ):
        return gdf.iloc[:0]

    if isinstance(mask, (GeoDataFrame, GeoSeries)):
        combined_mask = mask.geometry.union_all()
    else:
        combined_mask = mask

    clipped = _clip_gdf_with_mask(gdf, combined_mask, sort=sort)

    if keep_geom_type:
        geomcoll_concat = (clipped.geom_type == "GeometryCollection").any()
        geomcoll_orig = (gdf.geom_type == "GeometryCollection").any()

        new_collection = geomcoll_concat and not geomcoll_orig

        if geomcoll_orig:
            warnings.warn(
                "keep_geom_type can not be called on a "
                "GeoDataFrame with GeometryCollection.",
                stacklevel=2,
            )
        else:
            polys = ["Polygon", "MultiPolygon"]
            lines = ["LineString", "MultiLineString", "LinearRing"]
            points = ["Point", "MultiPoint"]

            # Check the gdf for multiple geom types (points, lines and/or polys)
            orig_types_total = sum(
                [
                    gdf.geom_type.isin(polys).any(),
                    gdf.geom_type.isin(lines).any(),
                    gdf.geom_type.isin(points).any(),
                ]
            )

            # Check how many geometry types are in the clipped GeoDataFrame
            clip_types_total = sum(
                [
                    clipped.geom_type.isin(polys).any(),
                    clipped.geom_type.isin(lines).any(),
                    clipped.geom_type.isin(points).any(),
                ]
            )

            # Check there aren't any new geom types in the clipped GeoDataFrame
            more_types = orig_types_total < clip_types_total

            if orig_types_total > 1:
                warnings.warn(
                    "keep_geom_type can not be called on a mixed type GeoDataFrame.",
                    stacklevel=2,
                )
            elif new_collection or more_types:
                orig_type = gdf.geom_type.iloc[0]
                if new_collection:
                    clipped = clipped.explode(index_parts=False)
                if orig_type in polys:
                    clipped = clipped.loc[clipped.geom_type.isin(polys)]
                elif orig_type in lines:
                    clipped = clipped.loc[clipped.geom_type.isin(lines)]

    return clipped
@@ -0,0 +1,184 @@
import time
from collections import defaultdict

import pandas as pd

from shapely.geometry import Point

import geopandas


def _get_throttle_time(provider):
    """
    Amount of time to wait between requests to a geocoding API, for providers
    that specify rate limits in their terms of service.
    """
    import geopy.geocoders

    # https://operations.osmfoundation.org/policies/nominatim/
    if provider == geopy.geocoders.Nominatim:
        return 1
    else:
        return 0


def geocode(strings, provider=None, **kwargs):
    """
    Geocode a set of strings and get a GeoDataFrame of the resulting points.

    Parameters
    ----------
    strings : list or Series of addresses to geocode
    provider : str or geopy.geocoder
        Specifies geocoding service to use. If none is provided,
        will use 'photon' (see Photon's terms of service at:
        https://photon.komoot.io).

        Either the string name used by geopy (as specified in
        geopy.geocoders.SERVICE_TO_GEOCODER) or a geopy Geocoder instance
        (e.g., geopy.geocoders.Photon) may be used.

        Some providers require additional arguments such as access keys.
        See each geocoder's specific parameters in geopy.geocoders.

    Notes
    -----
    Ensure proper use of the results by consulting the Terms of Service for
    your provider.

    Geocoding requires geopy. Install it using 'pip install geopy'. See also
    https://github.com/geopy/geopy

    Examples
    --------
    >>> df = geopandas.tools.geocode(  # doctest: +SKIP
    ...     ["boston, ma", "1600 pennsylvania ave. washington, dc"]
    ... )
    >>> df  # doctest: +SKIP
                         geometry                                            address
    0  POINT (-71.05863 42.35899)                          Boston, MA, United States
    1  POINT (-77.03651 38.89766)  1600 Pennsylvania Ave NW, Washington, DC 20006...
    """
    if provider is None:
        provider = "photon"
    throttle_time = _get_throttle_time(provider)

    return _query(strings, True, provider, throttle_time, **kwargs)


def reverse_geocode(points, provider=None, **kwargs):
    """
    Reverse geocode a set of points and get a GeoDataFrame of the resulting
    addresses.

    The points should be in geographic coordinates: the x coordinate is
    interpreted as longitude and the y coordinate as latitude.

    Parameters
    ----------
    points : list or Series of Shapely Point objects.
        x coordinate is longitude
        y coordinate is latitude
    provider : str or geopy.geocoder (opt)
        Specifies geocoding service to use. If none is provided,
        will use 'photon' (see Photon's terms of service at:
        https://photon.komoot.io).

        Either the string name used by geopy (as specified in
        geopy.geocoders.SERVICE_TO_GEOCODER) or a geopy Geocoder instance
        (e.g., geopy.geocoders.Photon) may be used.

        Some providers require additional arguments such as access keys.
        See each geocoder's specific parameters in geopy.geocoders.

    Notes
    -----
    Ensure proper use of the results by consulting the Terms of Service for
    your provider.

    Reverse geocoding requires geopy. Install it using 'pip install geopy'.
    See also https://github.com/geopy/geopy

    Examples
    --------
    >>> from shapely.geometry import Point
    >>> df = geopandas.tools.reverse_geocode(  # doctest: +SKIP
    ...     [Point(-71.0594869, 42.3584697), Point(-77.0365305, 38.8977332)]
    ... )
    >>> df  # doctest: +SKIP
                         geometry                                            address
    0  POINT (-71.05941 42.35837)       29 Court Sq, Boston, MA 02108, United States
    1  POINT (-77.03641 38.89766)  1600 Pennsylvania Ave NW, Washington, DC 20006...
    """
    if provider is None:
        provider = "photon"
    throttle_time = _get_throttle_time(provider)

    return _query(points, False, provider, throttle_time, **kwargs)


def _query(data, forward, provider, throttle_time, **kwargs):
    # generic wrapper for calls over lists to geopy Geocoders
    from geopy.geocoders import get_geocoder_for_service
    from geopy.geocoders.base import GeocoderQueryError

    if forward:
        if not isinstance(data, pd.Series):
            data = pd.Series(data)
    else:
        if not isinstance(data, geopandas.GeoSeries):
            data = geopandas.GeoSeries(data)

    if isinstance(provider, str):
        provider = get_geocoder_for_service(provider)

    coder = provider(**kwargs)
    results = {}
    for i, s in data.items():
        try:
            if forward:
                results[i] = coder.geocode(s)
            else:
                results[i] = coder.reverse((s.y, s.x), exactly_one=True)
        except (GeocoderQueryError, ValueError):
            results[i] = (None, None)
        time.sleep(throttle_time)

    df = _prepare_geocode_result(results)
    return df


def _prepare_geocode_result(results):
    """
    Helper function for the geocode function

    Takes a dict where keys are index entries, values are tuples containing:
    (address, (lat, lon))

    """
    # Prepare the data for the DataFrame as a dict of lists
    d = defaultdict(list)
    index = []

    for i, s in results.items():
        if s is None:
            p = Point()
            address = None

        else:
            address, loc = s

            # loc is lat, lon and we want lon, lat
            if loc is None:
                p = Point()
            else:
                p = Point(loc[1], loc[0])

        d["geometry"].append(p)
        d["address"].append(address)
        index.append(i)

    df = geopandas.GeoDataFrame(d, index=index, crs="EPSG:4326")

    return df
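
# Added illustration (not part of the commit): _prepare_geocode_result expects
# {index: (address, (lat, lon))} and swaps the pair into Point(lon, lat), e.g.
#   _prepare_geocode_result({0: ("Boston, MA", (42.3584, -71.0598))})
#   # -> row 0: geometry POINT (-71.0598 42.3584), address "Boston, MA"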
@@ -0,0 +1,188 @@
import numpy as np


def _hilbert_distance(geoms, total_bounds=None, level=16):
    """
    Calculate the distance along a Hilbert curve.

    The distances are calculated for the midpoints of the geometries in the
    GeoDataFrame.

    Parameters
    ----------
    geoms : GeometryArray
    total_bounds : 4-element array
        Total bounds of geometries - array
    level : int (1 - 16), default 16
        Determines the precision of the curve (points on the curve will
        have coordinates in the range [0, 2^level - 1]).

    Returns
    -------
    np.ndarray
        Array containing distances along the Hilbert curve

    """
    if geoms.is_empty.any() | geoms.isna().any():
        raise ValueError(
            "Hilbert distance cannot be computed on a GeoSeries with empty or "
            "missing geometries.",
        )
    # Calculate bounds as numpy array
    bounds = geoms.bounds

    # Calculate discrete coords based on total bounds and bounds
    x, y = _continuous_to_discrete_coords(bounds, level, total_bounds)
    # Compute distance along hilbert curve
    distances = _encode(level, x, y)

    return distances


def _continuous_to_discrete_coords(bounds, level, total_bounds):
    """
    Calculate the midpoints of the geometry bounds and convert them to
    discrete integer coordinates.

    Parameters
    ----------
    bounds : Bounds of each geometry - array
    level : The number of iterations used in constructing the Hilbert curve
    total_bounds : Total bounds of geometries - array

    Returns
    -------
    Tuple of two one-dimensional arrays
        Discrete x and y coordinates (one value per geometry)

    """
    # Hilbert side length
    side_length = (2**level) - 1

    # Calculate mid points for x and y bound coords - returns array
    x_mids = (bounds[:, 0] + bounds[:, 2]) / 2.0
    y_mids = (bounds[:, 1] + bounds[:, 3]) / 2.0

    # Calculate x and y range of total bound coords - returns array
    if total_bounds is None:
        total_bounds = (
            np.nanmin(x_mids),
            np.nanmin(y_mids),
            np.nanmax(x_mids),
            np.nanmax(y_mids),
        )

    xmin, ymin, xmax, ymax = total_bounds

    # Transform continuous value to discrete integer for each dimension
    x_int = _continuous_to_discrete(x_mids, (xmin, xmax), side_length)
    y_int = _continuous_to_discrete(y_mids, (ymin, ymax), side_length)

    return x_int, y_int


def _continuous_to_discrete(vals, val_range, n):
    """
    Convert a continuous one-dimensional array to discrete integer values
    based on its range.

    Parameters
    ----------
    vals : Array of continuous values
    val_range : Tuple containing range of continuous values
    n : Largest discrete value (the side length of the discrete grid)

    Returns
    -------
    One-dimensional array of discrete ints

    """
    width = val_range[1] - val_range[0]
    if width == 0:
        return np.zeros_like(vals, dtype=np.uint32)
    res = (vals - val_range[0]) * (n / width)

    np.clip(res, 0, n, out=res)
    return res.astype(np.uint32)
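
# Added worked example (illustrative values): with val_range=(0, 10) and n=3,
# vals [0, 5, 10] map to (vals - 0) * (3 / 10) = [0.0, 1.5, 3.0], which after
# clipping and the uint32 cast become [0, 1, 3].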

# Fast Hilbert curve algorithm by http://threadlocalmutex.com/
# From C++ https://github.com/rawrunprotected/hilbert_curves
# (public domain)


MAX_LEVEL = 16


def _interleave(x):
    x = (x | (x << 8)) & 0x00FF00FF
    x = (x | (x << 4)) & 0x0F0F0F0F
    x = (x | (x << 2)) & 0x33333333
    x = (x | (x << 1)) & 0x55555555
    return x
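
# Added explanatory note: _interleave spreads the low 16 bits of x into the even
# bit positions of a 32-bit word (Morton-style), e.g. 0b1011 -> 0b1000101, so the
# two index halves can be merged into a single curve index in _encode below.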


def _encode(level, x, y):
    x = np.asarray(x, dtype="uint32")
    y = np.asarray(y, dtype="uint32")

    if level > MAX_LEVEL:
        raise ValueError("Level out of range")

    x = x << (16 - level)
    y = y << (16 - level)

    # Initial prefix scan round, prime with x and y
    a = x ^ y
    b = 0xFFFF ^ a
    c = 0xFFFF ^ (x | y)
    d = x & (y ^ 0xFFFF)

    A = a | (b >> 1)
    B = (a >> 1) ^ a
    C = ((c >> 1) ^ (b & (d >> 1))) ^ c
    D = ((a & (c >> 1)) ^ (d >> 1)) ^ d

    a = A.copy()
    b = B.copy()
    c = C.copy()
    d = D.copy()

    A = (a & (a >> 2)) ^ (b & (b >> 2))
    B = (a & (b >> 2)) ^ (b & ((a ^ b) >> 2))
    C ^= (a & (c >> 2)) ^ (b & (d >> 2))
    D ^= (b & (c >> 2)) ^ ((a ^ b) & (d >> 2))

    a = A.copy()
    b = B.copy()
    c = C.copy()
    d = D.copy()

    A = (a & (a >> 4)) ^ (b & (b >> 4))
    B = (a & (b >> 4)) ^ (b & ((a ^ b) >> 4))
    C ^= (a & (c >> 4)) ^ (b & (d >> 4))
    D ^= (b & (c >> 4)) ^ ((a ^ b) & (d >> 4))

    # Final round and projection
    a = A.copy()
    b = B.copy()
    c = C.copy()
    d = D.copy()

    C ^= (a & (c >> 8)) ^ (b & (d >> 8))
    D ^= (b & (c >> 8)) ^ ((a ^ b) & (d >> 8))

    # Undo transformation prefix scan
    a = C ^ (C >> 1)
    b = D ^ (D >> 1)

    # Recover index bits
    i0 = x ^ y
    i1 = b | (0xFFFF ^ (i0 | a))

    return ((_interleave(i1) << 1) | _interleave(i0)) >> (32 - 2 * level)
@@ -0,0 +1,399 @@
import warnings
from functools import reduce

import numpy as np
import pandas as pd

from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import PANDAS_GE_30
from geopandas.array import _check_crs, _crs_mismatch_warn


def _ensure_geometry_column(df):
    """
    Helper function to ensure the geometry column is called 'geometry'.
    If another column with that name exists, it will be dropped.
    """
    if not df._geometry_column_name == "geometry":
        if PANDAS_GE_30:
            if "geometry" in df.columns:
                df = df.drop("geometry", axis=1)
            df = df.rename_geometry("geometry")
        else:
            if "geometry" in df.columns:
                df.drop("geometry", axis=1, inplace=True)
            df.rename_geometry("geometry", inplace=True)
    return df


def _overlay_intersection(df1, df2):
    """
    Overlay Intersection operation used in overlay function
    """
    # Spatial Index to create intersections
    idx1, idx2 = df2.sindex.query(df1.geometry, predicate="intersects", sort=True)
    # Create pairs of geometries in both dataframes to be intersected
    if idx1.size > 0 and idx2.size > 0:
        left = df1.geometry.take(idx1)
        left.reset_index(drop=True, inplace=True)
        right = df2.geometry.take(idx2)
        right.reset_index(drop=True, inplace=True)
        intersections = left.intersection(right)
        poly_ix = intersections.geom_type.isin(["Polygon", "MultiPolygon"])
        intersections.loc[poly_ix] = intersections[poly_ix].make_valid()

        # only keep actual intersecting geometries
        pairs_intersect = pd.DataFrame({"__idx1": idx1, "__idx2": idx2})
        geom_intersect = intersections

        # merge data for intersecting geometries
        df1 = df1.reset_index(drop=True)
        df2 = df2.reset_index(drop=True)
        dfinter = pairs_intersect.merge(
            df1.drop(df1._geometry_column_name, axis=1),
            left_on="__idx1",
            right_index=True,
        )
        dfinter = dfinter.merge(
            df2.drop(df2._geometry_column_name, axis=1),
            left_on="__idx2",
            right_index=True,
            suffixes=("_1", "_2"),
        )

        return GeoDataFrame(dfinter, geometry=geom_intersect, crs=df1.crs)
    else:
        result = df1.iloc[:0].merge(
            df2.iloc[:0].drop(df2.geometry.name, axis=1),
            left_index=True,
            right_index=True,
            suffixes=("_1", "_2"),
        )
        result["__idx1"] = np.nan
        result["__idx2"] = np.nan
        return result[
            result.columns.drop(df1.geometry.name).tolist() + [df1.geometry.name]
        ]


def _overlay_difference(df1, df2):
    """
    Overlay Difference operation used in overlay function
    """
    # spatial index query to find intersections
    idx1, idx2 = df2.sindex.query(df1.geometry, predicate="intersects", sort=True)
    idx1_unique, idx1_unique_indices = np.unique(idx1, return_index=True)
    idx2_split = np.split(idx2, idx1_unique_indices[1:])
    sidx = [
        idx2_split.pop(0) if idx in idx1_unique else []
        for idx in range(df1.geometry.size)
    ]
    # Create differences
    new_g = []
    for geom, neighbours in zip(df1.geometry, sidx):
        new = reduce(
            lambda x, y: x.difference(y), [geom] + list(df2.geometry.iloc[neighbours])
        )
        new_g.append(new)
    differences = GeoSeries(new_g, index=df1.index, crs=df1.crs)
    poly_ix = differences.geom_type.isin(["Polygon", "MultiPolygon"])
    differences.loc[poly_ix] = differences[poly_ix].make_valid()
    geom_diff = differences[~differences.is_empty].copy()
    dfdiff = df1[~differences.is_empty].copy()
    dfdiff[dfdiff._geometry_column_name] = geom_diff
    return dfdiff
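
# Added note: the reduce(...) in _overlay_difference successively erases every
# intersecting df2 geometry from each df1 geometry, i.e. it computes
# geom.difference(g_1).difference(g_2)... for the matched neighbours g_i.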

def _overlay_symmetric_diff(df1, df2):
    """
    Overlay Symmetric Difference operation used in overlay function
    """
    dfdiff1 = _overlay_difference(df1, df2)
    dfdiff2 = _overlay_difference(df2, df1)
    dfdiff1["__idx1"] = range(len(dfdiff1))
    dfdiff2["__idx2"] = range(len(dfdiff2))
    dfdiff1["__idx2"] = np.nan
    dfdiff2["__idx1"] = np.nan
    # ensure geometry name (otherwise merge goes wrong)
    dfdiff1 = _ensure_geometry_column(dfdiff1)
    dfdiff2 = _ensure_geometry_column(dfdiff2)
    # combine both 'difference' dataframes
    dfsym = dfdiff1.merge(
        dfdiff2, on=["__idx1", "__idx2"], how="outer", suffixes=("_1", "_2")
    )
    geometry = dfsym.geometry_1.copy()
    geometry.name = "geometry"
    # https://github.com/pandas-dev/pandas/issues/26468 use loc for now
    geometry.loc[dfsym.geometry_1.isnull()] = dfsym.loc[
        dfsym.geometry_1.isnull(), "geometry_2"
    ]
    dfsym.drop(["geometry_1", "geometry_2"], axis=1, inplace=True)
    dfsym.reset_index(drop=True, inplace=True)
    dfsym = GeoDataFrame(dfsym, geometry=geometry, crs=df1.crs)
    return dfsym


def _overlay_union(df1, df2):
    """
    Overlay Union operation used in overlay function
    """
    dfinter = _overlay_intersection(df1, df2)
    dfsym = _overlay_symmetric_diff(df1, df2)
    dfunion = pd.concat([dfinter, dfsym], ignore_index=True, sort=False)
    # keep geometry column last
    columns = list(dfunion.columns)
    columns.remove("geometry")
    columns.append("geometry")
    return dfunion.reindex(columns=columns)


def overlay(df1, df2, how="intersection", keep_geom_type=None, make_valid=True):
    """Perform spatial overlay between two GeoDataFrames.

    Currently only supports GeoDataFrames with uniform geometry types,
    i.e. containing only (Multi)Polygons, or only (Multi)Points, or a
    combination of (Multi)LineString and LinearRing shapes.
    Implements several methods that are all effectively subsets of the union.

    See the User Guide page :doc:`../../user_guide/set_operations` for details.

    Parameters
    ----------
    df1 : GeoDataFrame
    df2 : GeoDataFrame
    how : string
        Method of spatial overlay: 'intersection', 'union',
        'identity', 'symmetric_difference' or 'difference'.
    keep_geom_type : bool
        If True, return only geometries of the same geometry type as df1 has,
        if False, return all resulting geometries. Default is None,
        which will set keep_geom_type to True but warn upon dropping
        geometries.
    make_valid : bool, default True
        If True, any invalid input geometries are corrected with a call to
        make_valid(), if False, a `ValueError` is raised if any input geometries
        are invalid.

    Returns
    -------
    df : GeoDataFrame
        GeoDataFrame with new set of polygons and attributes
        resulting from the overlay

    Examples
    --------
    >>> from shapely.geometry import Polygon
    >>> polys1 = geopandas.GeoSeries([Polygon([(0,0), (2,0), (2,2), (0,2)]),
    ...                               Polygon([(2,2), (4,2), (4,4), (2,4)])])
    >>> polys2 = geopandas.GeoSeries([Polygon([(1,1), (3,1), (3,3), (1,3)]),
    ...                               Polygon([(3,3), (5,3), (5,5), (3,5)])])
    >>> df1 = geopandas.GeoDataFrame({'geometry': polys1, 'df1_data':[1,2]})
    >>> df2 = geopandas.GeoDataFrame({'geometry': polys2, 'df2_data':[1,2]})

    >>> geopandas.overlay(df1, df2, how='union')
       df1_data  df2_data                                           geometry
    0       1.0       1.0                POLYGON ((2 2, 2 1, 1 1, 1 2, 2 2))
    1       2.0       1.0                POLYGON ((2 2, 2 3, 3 3, 3 2, 2 2))
    2       2.0       2.0                POLYGON ((4 4, 4 3, 3 3, 3 4, 4 4))
    3       1.0       NaN      POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))
    4       2.0       NaN  MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...
    5       NaN       1.0  MULTIPOLYGON (((2 3, 2 2, 1 2, 1 3, 2 3)), ((3...
    6       NaN       2.0      POLYGON ((3 5, 5 5, 5 3, 4 3, 4 4, 3 4, 3 5))

    >>> geopandas.overlay(df1, df2, how='intersection')
       df1_data  df2_data                             geometry
    0         1         1  POLYGON ((2 2, 2 1, 1 1, 1 2, 2 2))
    1         2         1  POLYGON ((2 2, 2 3, 3 3, 3 2, 2 2))
    2         2         2  POLYGON ((4 4, 4 3, 3 3, 3 4, 4 4))

    >>> geopandas.overlay(df1, df2, how='symmetric_difference')
       df1_data  df2_data                                           geometry
    0       1.0       NaN      POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))
    1       2.0       NaN  MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...
    2       NaN       1.0  MULTIPOLYGON (((2 3, 2 2, 1 2, 1 3, 2 3)), ((3...
    3       NaN       2.0      POLYGON ((3 5, 5 5, 5 3, 4 3, 4 4, 3 4, 3 5))

    >>> geopandas.overlay(df1, df2, how='difference')
                                                geometry  df1_data
    0      POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))         1
    1  MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...         2

    >>> geopandas.overlay(df1, df2, how='identity')
       df1_data  df2_data                                           geometry
    0       1.0       1.0                POLYGON ((2 2, 2 1, 1 1, 1 2, 2 2))
    1       2.0       1.0                POLYGON ((2 2, 2 3, 3 3, 3 2, 2 2))
    2       2.0       2.0                POLYGON ((4 4, 4 3, 3 3, 3 4, 4 4))
    3       1.0       NaN      POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))
    4       2.0       NaN  MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...

    See also
    --------
    sjoin : spatial join
    GeoDataFrame.overlay : equivalent method

    Notes
    -----
    Every operation in GeoPandas is planar, i.e. the potential third
    dimension is not taken into account.
    """
    # Allowed operations
    allowed_hows = [
        "intersection",
        "union",
        "identity",
        "symmetric_difference",
        "difference",  # aka erase
    ]
    # Error Messages
    if how not in allowed_hows:
        raise ValueError(
            "`how` was '{0}' but is expected to be in {1}".format(how, allowed_hows)
        )

    if isinstance(df1, GeoSeries) or isinstance(df2, GeoSeries):
        raise NotImplementedError(
            "overlay currently only implemented for GeoDataFrames"
        )

    if not _check_crs(df1, df2):
        _crs_mismatch_warn(df1, df2, stacklevel=3)

    if keep_geom_type is None:
        keep_geom_type = True
        keep_geom_type_warning = True
    else:
        keep_geom_type_warning = False

    polys = ["Polygon", "MultiPolygon"]
    lines = ["LineString", "MultiLineString", "LinearRing"]
    points = ["Point", "MultiPoint"]
    for i, df in enumerate([df1, df2]):
        poly_check = df.geom_type.isin(polys).any()
        lines_check = df.geom_type.isin(lines).any()
        points_check = df.geom_type.isin(points).any()
        if sum([poly_check, lines_check, points_check]) > 1:
            raise NotImplementedError(
                "df{} contains mixed geometry types.".format(i + 1)
            )

    if how == "intersection":
        box_gdf1 = df1.total_bounds
        box_gdf2 = df2.total_bounds

        if not (
            ((box_gdf1[0] <= box_gdf2[2]) and (box_gdf2[0] <= box_gdf1[2]))
            and ((box_gdf1[1] <= box_gdf2[3]) and (box_gdf2[1] <= box_gdf1[3]))
        ):
            result = df1.iloc[:0].merge(
                df2.iloc[:0].drop(df2.geometry.name, axis=1),
                left_index=True,
                right_index=True,
                suffixes=("_1", "_2"),
            )
            return result[
                result.columns.drop(df1.geometry.name).tolist() + [df1.geometry.name]
            ]

    # Computations
    def _make_valid(df):
        df = df.copy()
        if df.geom_type.isin(polys).all():
            mask = ~df.geometry.is_valid
            col = df._geometry_column_name
            if make_valid:
                df.loc[mask, col] = df.loc[mask, col].make_valid()
            elif mask.any():
                raise ValueError(
                    "You have passed make_valid=False along with "
                    f"{mask.sum()} invalid input geometries. "
                    "Use make_valid=True or make sure that all geometries "
                    "are valid before using overlay."
                )
        return df

    df1 = _make_valid(df1)
    df2 = _make_valid(df2)

    with warnings.catch_warnings():  # CRS checked above, suppress array-level warning
        warnings.filterwarnings("ignore", message="CRS mismatch between the CRS")
        if how == "difference":
            result = _overlay_difference(df1, df2)
        elif how == "intersection":
            result = _overlay_intersection(df1, df2)
        elif how == "symmetric_difference":
            result = _overlay_symmetric_diff(df1, df2)
        elif how == "union":
            result = _overlay_union(df1, df2)
        elif how == "identity":
            dfunion = _overlay_union(df1, df2)
            result = dfunion[dfunion["__idx1"].notnull()].copy()

        if how in ["intersection", "symmetric_difference", "union", "identity"]:
            result.drop(["__idx1", "__idx2"], axis=1, inplace=True)

    if keep_geom_type:
        geom_type = df1.geom_type.iloc[0]

        # First we filter the geometry types inside GeometryCollections objects
        # (e.g. GeometryCollection([polygon, point]) -> polygon)
        # we do this separately on only the relevant rows, as this is an expensive
        # operation (an expensive no-op for geometry types other than collections)
        is_collection = result.geom_type == "GeometryCollection"
        if is_collection.any():
            geom_col = result._geometry_column_name
            collections = result[[geom_col]][is_collection]

            exploded = collections.reset_index(drop=True).explode(index_parts=True)
            exploded = exploded.reset_index(level=0)

            orig_num_geoms_exploded = exploded.shape[0]
            if geom_type in polys:
                exploded.loc[~exploded.geom_type.isin(polys), geom_col] = None
            elif geom_type in lines:
                exploded.loc[~exploded.geom_type.isin(lines), geom_col] = None
            elif geom_type in points:
                exploded.loc[~exploded.geom_type.isin(points), geom_col] = None
            else:
                raise TypeError(
                    "`keep_geom_type` does not support {}.".format(geom_type)
                )
            num_dropped_collection = (
                orig_num_geoms_exploded - exploded.geometry.isna().sum()
            )

            # level_0 created with above reset_index operation
            # and represents the original geometry collections
            # TODO avoiding dissolve to call union_all in this case could further
            # improve performance (we only need to collect geometries in their
            # respective Multi version)
            dissolved = exploded.dissolve(by="level_0")
            result.loc[is_collection, geom_col] = dissolved[geom_col].values
        else:
            num_dropped_collection = 0

        # Now we filter all geometries (in theory we don't need to do this
        # again for the rows handled above for GeometryCollections, but filtering
        # them out is probably more expensive than simply including them, as this
        # typically concerns only a few rows)
        orig_num_geoms = result.shape[0]
        if geom_type in polys:
            result = result.loc[result.geom_type.isin(polys)]
        elif geom_type in lines:
            result = result.loc[result.geom_type.isin(lines)]
        elif geom_type in points:
            result = result.loc[result.geom_type.isin(points)]
        else:
            raise TypeError("`keep_geom_type` does not support {}.".format(geom_type))
        num_dropped = orig_num_geoms - result.shape[0]

        if (num_dropped > 0 or num_dropped_collection > 0) and keep_geom_type_warning:
            warnings.warn(
                "`keep_geom_type=True` in overlay resulted in {} dropped "
                "geometries of different geometry types than df1 has. "
                "Set `keep_geom_type=False` to retain all "
                "geometries".format(num_dropped + num_dropped_collection),
                UserWarning,
                stacklevel=2,
            )

    result.reset_index(drop=True, inplace=True)
    return result
@@ -0,0 +1,734 @@
import warnings
from functools import partial
from typing import Optional

import numpy as np
import pandas as pd

from geopandas import GeoDataFrame
from geopandas._compat import PANDAS_GE_30
from geopandas.array import _check_crs, _crs_mismatch_warn


def sjoin(
    left_df,
    right_df,
    how="inner",
    predicate="intersects",
    lsuffix="left",
    rsuffix="right",
    distance=None,
    on_attribute=None,
    **kwargs,
):
    """Spatial join of two GeoDataFrames.

    See the User Guide page :doc:`../../user_guide/mergingdata` for details.

    Parameters
    ----------
    left_df, right_df : GeoDataFrames
    how : string, default 'inner'
        The type of join:

        * 'left': use keys from left_df; retain only left_df geometry column
        * 'right': use keys from right_df; retain only right_df geometry column
        * 'inner': use intersection of keys from both dfs; retain only
          left_df geometry column
    predicate : string, default 'intersects'
        Binary predicate. Valid values are determined by the spatial index used.
        You can check the valid values in left_df or right_df as
        ``left_df.sindex.valid_query_predicates`` or
        ``right_df.sindex.valid_query_predicates``.
        Replaces deprecated ``op`` parameter.
    lsuffix : string, default 'left'
        Suffix to apply to overlapping column names (left GeoDataFrame).
    rsuffix : string, default 'right'
        Suffix to apply to overlapping column names (right GeoDataFrame).
    distance : number or array_like, optional
        Distance(s) around each input geometry within which to query the tree
        for the 'dwithin' predicate. If array_like, must be
        one-dimensional with length equal to length of left GeoDataFrame.
        Required if ``predicate='dwithin'``.
    on_attribute : string, list or tuple
        Column name(s) to join on as an additional join restriction on top
        of the spatial predicate. These must be found in both DataFrames.
        If set, observations are joined only if the predicate applies
        and values in specified columns match.

    Examples
    --------
    >>> import geodatasets
    >>> chicago = geopandas.read_file(
    ...     geodatasets.get_path("geoda.chicago_health")
    ... )
    >>> groceries = geopandas.read_file(
    ...     geodatasets.get_path("geoda.groceries")
    ... ).to_crs(chicago.crs)

    >>> chicago.head()  # doctest: +SKIP
       ComAreaID  ...                                           geometry
    0         35  ...  POLYGON ((-87.60914 41.84469, -87.60915 41.844...
    1         36  ...  POLYGON ((-87.59215 41.81693, -87.59231 41.816...
    2         37  ...  POLYGON ((-87.62880 41.80189, -87.62879 41.801...
    3         38  ...  POLYGON ((-87.60671 41.81681, -87.60670 41.816...
    4         39  ...  POLYGON ((-87.59215 41.81693, -87.59215 41.816...
    [5 rows x 87 columns]

    >>> groceries.head()  # doctest: +SKIP
       OBJECTID     Ycoord  ...  Category                         geometry
    0        16  41.973266  ...       NaN  MULTIPOINT (-87.65661 41.97321)
    1        18  41.696367  ...       NaN  MULTIPOINT (-87.68136 41.69713)
    2        22  41.868634  ...       NaN  MULTIPOINT (-87.63918 41.86847)
    3        23  41.877590  ...       new  MULTIPOINT (-87.65495 41.87783)
    4        27  41.737696  ...       NaN  MULTIPOINT (-87.62715 41.73623)
    [5 rows x 8 columns]

    >>> groceries_w_communities = geopandas.sjoin(groceries, chicago)
    >>> groceries_w_communities.head()  # doctest: +SKIP
       OBJECTID       community                           geometry
    0        16          UPTOWN  MULTIPOINT ((-87.65661 41.97321))
    1        18     MORGAN PARK  MULTIPOINT ((-87.68136 41.69713))
    2        22  NEAR WEST SIDE  MULTIPOINT ((-87.63918 41.86847))
    3        23  NEAR WEST SIDE  MULTIPOINT ((-87.65495 41.87783))
    4        27         CHATHAM  MULTIPOINT ((-87.62715 41.73623))
    [5 rows x 95 columns]

    See also
    --------
    overlay : overlay operation resulting in a new geometry
    GeoDataFrame.sjoin : equivalent method

    Notes
    -----
    Every operation in GeoPandas is planar, i.e. the potential third
    dimension is not taken into account.
    """
    if kwargs:
        first = next(iter(kwargs.keys()))
        raise TypeError(f"sjoin() got an unexpected keyword argument '{first}'")

    on_attribute = _maybe_make_list(on_attribute)

    _basic_checks(left_df, right_df, how, lsuffix, rsuffix, on_attribute=on_attribute)

    indices = _geom_predicate_query(
        left_df, right_df, predicate, distance, on_attribute=on_attribute
    )

    joined, _ = _frame_join(
        left_df,
        right_df,
        indices,
        None,
        how,
        lsuffix,
        rsuffix,
        predicate,
        on_attribute=on_attribute,
    )

    return joined


def _maybe_make_list(obj):
    if isinstance(obj, tuple):
        return list(obj)
    if obj is not None and not isinstance(obj, list):
        return [obj]
    return obj
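
# Added illustration: _maybe_make_list("pop") -> ["pop"] and ("a", "b") -> ["a", "b"],
# while None and objects that are already lists pass through unchanged.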

def _basic_checks(left_df, right_df, how, lsuffix, rsuffix, on_attribute=None):
    """Checks the validity of join input parameters.

    `how` must be one of the valid options.
    `'index_'` concatenated with `lsuffix` or `rsuffix` must not already
    exist as columns in the left or right data frames.

    Parameters
    ----------
    left_df : GeoDataFrame
    right_df : GeoDataFrame
    how : str, one of 'left', 'right', 'inner'
        join type
    lsuffix : str
        left index suffix
    rsuffix : str
        right index suffix
    on_attribute : list, default None
        list of column names to merge on along with geometry
    """
    if not isinstance(left_df, GeoDataFrame):
        raise ValueError(
            "'left_df' should be GeoDataFrame, got {}".format(type(left_df))
        )

    if not isinstance(right_df, GeoDataFrame):
        raise ValueError(
            "'right_df' should be GeoDataFrame, got {}".format(type(right_df))
        )

    allowed_hows = ["left", "right", "inner"]
    if how not in allowed_hows:
        raise ValueError(
            '`how` was "{}" but is expected to be in {}'.format(how, allowed_hows)
        )

    if not _check_crs(left_df, right_df):
        _crs_mismatch_warn(left_df, right_df, stacklevel=4)

    if on_attribute:
        for attr in on_attribute:
            if (attr not in left_df) and (attr not in right_df):
                raise ValueError(
                    f"Expected column {attr} is missing from both of the dataframes."
                )
            if attr not in left_df:
                raise ValueError(
                    f"Expected column {attr} is missing from the left dataframe."
                )
            if attr not in right_df:
                raise ValueError(
                    f"Expected column {attr} is missing from the right dataframe."
                )
            if attr in (left_df.geometry.name, right_df.geometry.name):
                raise ValueError(
                    "Active geometry column cannot be used as an input "
                    "for on_attribute parameter."
                )


def _geom_predicate_query(left_df, right_df, predicate, distance, on_attribute=None):
    """Compute geometric comparisons and get matching indices.

    Parameters
    ----------
    left_df : GeoDataFrame
    right_df : GeoDataFrame
    predicate : string
        Binary predicate to query.
    on_attribute : list, default None
        list of column names to merge on along with geometry

    Returns
    -------
    tuple of ndarray
        Integer indices of the matching rows in `left_df` and `right_df`,
        respectively.
    """
    original_predicate = predicate

    if predicate == "within":
        # within is implemented as the inverse of contains
        # contains is a faster predicate
        # see discussion at https://github.com/geopandas/geopandas/pull/1421
        predicate = "contains"
        sindex = left_df.sindex
        input_geoms = right_df.geometry
    else:
        # all other predicates are symmetric
        # keep them the same
        sindex = right_df.sindex
        input_geoms = left_df.geometry

    if sindex:
        l_idx, r_idx = sindex.query(
            input_geoms, predicate=predicate, sort=False, distance=distance
        )
    else:
        # when sindex is empty / has no valid geometries
        l_idx, r_idx = np.array([], dtype=np.intp), np.array([], dtype=np.intp)

    if original_predicate == "within":
        # within is implemented as the inverse of contains
        # flip back the results
        r_idx, l_idx = l_idx, r_idx
        indexer = np.lexsort((r_idx, l_idx))
        l_idx = l_idx[indexer]
        r_idx = r_idx[indexer]

    if on_attribute:
        for attr in on_attribute:
            (l_idx, r_idx), _ = _filter_shared_attribute(
                left_df, right_df, l_idx, r_idx, attr
            )

    return l_idx, r_idx


def _reset_index_with_suffix(df, suffix, other):
    """
    Equivalent of df.reset_index(), but with adding 'suffix' to auto-generated
    column names.
    """
    index_original = df.index.names
    if PANDAS_GE_30:
        df_reset = df.reset_index()
    else:
        # we already made a copy of the dataframe in _frame_join before getting here
        df_reset = df
        df_reset.reset_index(inplace=True)
    column_names = df_reset.columns.to_numpy(copy=True)
    for i, label in enumerate(index_original):
        # if the original label was None, add suffix to auto-generated name
        if label is None:
            new_label = column_names[i]
            if "level" in new_label:
                # reset_index of MultiIndex gives "level_i" names, preserve the "i"
                lev = new_label.split("_")[1]
                new_label = f"index_{suffix}{lev}"
            else:
                new_label = f"index_{suffix}"
            # check new label will not be in other dataframe
            if new_label in df.columns or new_label in other.columns:
                raise ValueError(
                    "'{0}' cannot be a column name in the frames being"
                    " joined".format(new_label)
                )
            column_names[i] = new_label
    return df_reset, pd.Index(column_names)
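
# Added illustration (hypothetical frames): resetting an unnamed single index with
# suffix "left" yields a column "index_left"; for an unnamed MultiIndex, pandas'
# auto-generated "level_0", "level_1", ... become "index_left0", "index_left1", ...
# via the renaming loop above.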
|
||||
|
||||
def _process_column_names_with_suffix(
|
||||
left: pd.Index, right: pd.Index, suffixes, left_df, right_df
|
||||
):
|
||||
"""
|
||||
Add suffixes to overlapping labels (ignoring the geometry column).
|
||||
|
||||
This is based on pandas' merge logic at https://github.com/pandas-dev/pandas/blob/
|
||||
a0779adb183345a8eb4be58b3ad00c223da58768/pandas/core/reshape/merge.py#L2300-L2370
|
||||
"""
|
||||
to_rename = left.intersection(right)
|
||||
if len(to_rename) == 0:
|
||||
return left, right
|
||||
|
||||
lsuffix, rsuffix = suffixes
|
||||
|
||||
if not lsuffix and not rsuffix:
|
||||
raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
|
||||
|
||||
def renamer(x, suffix, geometry):
|
||||
if x in to_rename and x != geometry and suffix is not None:
|
||||
return f"{x}_{suffix}"
|
||||
return x
|
||||
|
||||
lrenamer = partial(
|
||||
renamer,
|
||||
suffix=lsuffix,
|
||||
geometry=getattr(left_df, "_geometry_column_name", None),
|
||||
)
|
||||
rrenamer = partial(
|
||||
renamer,
|
||||
suffix=rsuffix,
|
||||
geometry=getattr(right_df, "_geometry_column_name", None),
|
||||
)
|
||||
|
||||
# TODO retain index name?
|
||||
left_renamed = pd.Index([lrenamer(lab) for lab in left])
|
||||
right_renamed = pd.Index([rrenamer(lab) for lab in right])
|
||||
|
||||
dups = []
|
||||
if not left_renamed.is_unique:
|
||||
# Only warn when duplicates are caused because of suffixes, already duplicated
|
||||
# columns in origin should not warn
|
||||
dups = left_renamed[(left_renamed.duplicated()) & (~left.duplicated())].tolist()
|
||||
if not right_renamed.is_unique:
|
||||
dups.extend(
|
||||
right_renamed[(right_renamed.duplicated()) & (~right.duplicated())].tolist()
|
||||
)
|
||||
# TODO turn this into an error (pandas has done so as well)
|
||||
if dups:
|
||||
warnings.warn(
|
||||
f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
|
||||
f"result is deprecated and will raise a MergeError in a future version.",
|
||||
FutureWarning,
|
||||
stacklevel=4,
|
||||
)
|
||||
|
||||
return left_renamed, right_renamed
|
||||
|
||||
|
||||
def _restore_index(joined, index_names, index_names_original):
    """
    Set back the original index columns, restoring their name as `None`
    if they didn't have a name originally.
    """
    if PANDAS_GE_30:
        joined = joined.set_index(list(index_names))
    else:
        joined.set_index(list(index_names), inplace=True)

    # restore the fact that the index didn't have a name
    joined_index_names = list(joined.index.names)
    for i, label in enumerate(index_names_original):
        if label is None:
            joined_index_names[i] = None
    joined.index.names = joined_index_names
    return joined

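
# Illustrative sketch (not part of this commit): restoring an unnamed index
# after the join. The temporary "index_left" column becomes the index again,
# and its auto-generated name is dropped.
#
#   import pandas as pd
#   joined = pd.DataFrame({"index_left": [0, 1], "a": [10, 20]})
#   joined = joined.set_index(["index_left"])
#   joined.index.names = [None]   # the original index had no name
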
def _adjust_indexers(indices, distances, original_length, how, predicate):
    """
    The left/right indexers from the query represent an inner join.
    For a left or right join, we need to adjust them to include the rows
    that would not be present in an inner join.
    """
    # the indices represent an inner join, no adjustment needed
    if how == "inner":
        return indices, distances

    l_idx, r_idx = indices

    if how == "right":
        # re-sort so it is sorted by the right indexer
        indexer = np.lexsort((l_idx, r_idx))
        l_idx, r_idx = l_idx[indexer], r_idx[indexer]
        if distances is not None:
            distances = distances[indexer]

        # switch order
        r_idx, l_idx = l_idx, r_idx

    # determine which indices are missing and where they would need to be inserted
    idx = np.arange(original_length)
    l_idx_missing = idx[~np.isin(idx, l_idx)]
    insert_idx = np.searchsorted(l_idx, l_idx_missing)
    # for the left indexer, insert those missing indices
    l_idx = np.insert(l_idx, insert_idx, l_idx_missing)
    # for the right indexer, insert -1 -> to get missing values in pandas' reindexing
    r_idx = np.insert(r_idx, insert_idx, -1)
    # for the distances, insert the missing values (NaN) manually
    if distances is not None:
        distances = np.insert(distances, insert_idx, np.nan)

    if how == "right":
        # switch back
        l_idx, r_idx = r_idx, l_idx

    return (l_idx, r_idx), distances

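
# Illustrative sketch (not part of this commit) of the adjustment above with
# plain numpy: left rows 0..3 exist, but only rows 0 and 2 matched anything.
#
#   import numpy as np
#   l_idx, r_idx = np.array([0, 2]), np.array([5, 7])
#   idx = np.arange(4)                           # original_length == 4
#   missing = idx[~np.isin(idx, l_idx)]          # array([1, 3])
#   insert_at = np.searchsorted(l_idx, missing)  # array([1, 2])
#   np.insert(l_idx, insert_at, missing)         # array([0, 1, 2, 3])
#   np.insert(r_idx, insert_at, -1)              # array([ 5, -1,  7, -1])
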
def _frame_join(
    left_df,
    right_df,
    indices,
    distances,
    how,
    lsuffix,
    rsuffix,
    predicate,
    on_attribute=None,
):
    """Join the GeoDataFrames at the DataFrame level.

    Parameters
    ----------
    left_df : GeoDataFrame
    right_df : GeoDataFrame
    indices : tuple of ndarray
        Indices returned by the geometric join. Tuple with integer
        indices representing the matches from `left_df` and `right_df`
        respectively.
    distances : ndarray, optional
        Passed through and adapted based on the indices, if needed.
    how : string
        The type of join to use on the DataFrame level.
    lsuffix : string
        Suffix to apply to overlapping column names (left GeoDataFrame).
    rsuffix : string
        Suffix to apply to overlapping column names (right GeoDataFrame).
    on_attribute : list, default None
        List of column names to merge on along with geometry.

    Returns
    -------
    GeoDataFrame
        Joined GeoDataFrame.
    """
    if on_attribute:  # avoid renaming or duplicating shared column
        right_df = right_df.drop(on_attribute, axis=1)

    if how in ("inner", "left"):
        right_df = right_df.drop(right_df.geometry.name, axis=1)
    else:  # how == 'right':
        left_df = left_df.drop(left_df.geometry.name, axis=1)

    left_df = left_df.copy(deep=False)
    left_nlevels = left_df.index.nlevels
    left_index_original = left_df.index.names
    left_df, left_column_names = _reset_index_with_suffix(left_df, lsuffix, right_df)

    right_df = right_df.copy(deep=False)
    right_nlevels = right_df.index.nlevels
    right_index_original = right_df.index.names
    right_df, right_column_names = _reset_index_with_suffix(right_df, rsuffix, left_df)

    # if conflicting names in left and right, add suffix
    left_column_names, right_column_names = _process_column_names_with_suffix(
        left_column_names,
        right_column_names,
        (lsuffix, rsuffix),
        left_df,
        right_df,
    )
    left_df.columns = left_column_names
    right_df.columns = right_column_names
    left_index = left_df.columns[:left_nlevels]
    right_index = right_df.columns[:right_nlevels]

    # perform join on the dataframes
    original_length = len(right_df) if how == "right" else len(left_df)
    (l_idx, r_idx), distances = _adjust_indexers(
        indices, distances, original_length, how, predicate
    )
    # the `take` method doesn't allow introducing NaNs with -1 indices
    # left = left_df.take(l_idx)
    # therefore we are using the private _reindex_with_indexers as workaround
    new_index = pd.RangeIndex(len(l_idx))
    left = left_df._reindex_with_indexers({0: (new_index, l_idx)})
    right = right_df._reindex_with_indexers({0: (new_index, r_idx)})
    if PANDAS_GE_30:
        kwargs = {}
    else:
        kwargs = dict(copy=False)
    joined = pd.concat([left, right], axis=1, **kwargs)

    if how in ("inner", "left"):
        joined = _restore_index(joined, left_index, left_index_original)
    else:  # how == 'right':
        joined = joined.set_geometry(right_df.geometry.name)
        joined = _restore_index(joined, right_index, right_index_original)

    return joined, distances

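
# Illustrative sketch (not part of this commit) of why the workaround above is
# needed: in pandas' low-level reindexing a -1 in the indexer means "missing"
# and yields NaN, whereas DataFrame.take() interprets -1 as the last row.
#
#   import numpy as np
#   import pandas as pd
#   df = pd.DataFrame({"a": [10, 20]})
#   df.take([0, -1])["a"].tolist()      # [10, 20]  (-1 selects the last row)
#   df._reindex_with_indexers(          # private pandas API, as used above
#       {0: (pd.RangeIndex(3), np.array([0, -1, 1]))}
#   )["a"].tolist()                     # [10.0, nan, 20.0]
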
def _nearest_query(
    left_df: GeoDataFrame,
    right_df: GeoDataFrame,
    max_distance: float,
    how: str,
    return_distance: bool,
    exclusive: bool,
    on_attribute: Optional[list] = None,
):
    # use the opposite of the join direction for the index
    use_left_as_sindex = how == "right"
    if use_left_as_sindex:
        sindex = left_df.sindex
        query = right_df.geometry
    else:
        sindex = right_df.sindex
        query = left_df.geometry
    if sindex:
        res = sindex.nearest(
            query,
            return_all=True,
            max_distance=max_distance,
            return_distance=return_distance,
            exclusive=exclusive,
        )
        if return_distance:
            (input_idx, tree_idx), distances = res
        else:
            (input_idx, tree_idx) = res
            distances = None
        if use_left_as_sindex:
            l_idx, r_idx = tree_idx, input_idx
            sort_order = np.argsort(l_idx, kind="stable")
            l_idx, r_idx = l_idx[sort_order], r_idx[sort_order]
            if distances is not None:
                distances = distances[sort_order]
        else:
            l_idx, r_idx = input_idx, tree_idx
    else:
        # when sindex is empty / has no valid geometries
        l_idx, r_idx = np.array([], dtype=np.intp), np.array([], dtype=np.intp)
        if return_distance:
            distances = np.array([], dtype=np.float64)
        else:
            distances = None

    if on_attribute:
        for attr in on_attribute:
            (l_idx, r_idx), shared_attribute_rows = _filter_shared_attribute(
                left_df, right_df, l_idx, r_idx, attr
            )
            distances = distances[shared_attribute_rows]

    return (l_idx, r_idx), distances

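
# Illustrative sketch (not part of this commit) of the sindex.nearest call
# above: it returns a 2 x n array of (query geometry, tree geometry) positions,
# plus an array of distances when return_distance=True.
#
#   import geopandas
#   from shapely.geometry import Point
#   query = geopandas.GeoSeries([Point(0, 0), Point(3, 3)])
#   tree = geopandas.GeoSeries([Point(0, 1), Point(3, 4)])
#   (input_idx, tree_idx), dist = tree.sindex.nearest(
#       query, return_all=True, return_distance=True
#   )
#   # input_idx -> [0, 1], tree_idx -> [0, 1], dist -> [1.0, 1.0]
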
def _filter_shared_attribute(left_df, right_df, l_idx, r_idx, attribute):
    """
    Return the indices for the left and right dataframe that share the same entry
    in the attribute column. Also returns the Boolean mask `shared_attribute_rows`
    selecting the rows with matching entries.
    """
    shared_attribute_rows = (
        left_df[attribute].iloc[l_idx].values == right_df[attribute].iloc[r_idx].values
    )

    l_idx = l_idx[shared_attribute_rows]
    r_idx = r_idx[shared_attribute_rows]
    return (l_idx, r_idx), shared_attribute_rows

def sjoin_nearest(
    left_df: GeoDataFrame,
    right_df: GeoDataFrame,
    how: str = "inner",
    max_distance: Optional[float] = None,
    lsuffix: str = "left",
    rsuffix: str = "right",
    distance_col: Optional[str] = None,
    exclusive: bool = False,
) -> GeoDataFrame:
    """Spatial join of two GeoDataFrames based on the distance between their geometries.

    Results will include multiple output records for a single input record
    where there are multiple equidistant nearest or intersected neighbors.

    Distance is calculated in CRS units and can be returned using the
    `distance_col` parameter.

    See the User Guide page
    https://geopandas.readthedocs.io/en/latest/docs/user_guide/mergingdata.html
    for more details.

    Parameters
    ----------
    left_df, right_df : GeoDataFrames
    how : string, default 'inner'
        The type of join:

        * 'left': use keys from left_df; retain only left_df geometry column
        * 'right': use keys from right_df; retain only right_df geometry column
        * 'inner': use intersection of keys from both dfs; retain only
          left_df geometry column
    max_distance : float, default None
        Maximum distance within which to query for nearest geometry.
        Must be greater than 0.
        The max_distance used to search for nearest items in the tree may have a
        significant impact on performance by reducing the number of input
        geometries that are evaluated for nearest items in the tree.
    lsuffix : string, default 'left'
        Suffix to apply to overlapping column names (left GeoDataFrame).
    rsuffix : string, default 'right'
        Suffix to apply to overlapping column names (right GeoDataFrame).
    distance_col : string, default None
        If set, save the distances computed between matching geometries under a
        column of this name in the joined GeoDataFrame.
    exclusive : bool, default False
        If True, the nearest geometries that are equal to the input geometry
        will not be returned.

    Examples
    --------
    >>> import geodatasets
    >>> groceries = geopandas.read_file(
    ...     geodatasets.get_path("geoda.groceries")
    ... )
    >>> chicago = geopandas.read_file(
    ...     geodatasets.get_path("geoda.chicago_health")
    ... ).to_crs(groceries.crs)

    >>> chicago.head()  # doctest: +SKIP
       ComAreaID  ...                                           geometry
    0         35  ...  POLYGON ((-87.60914 41.84469, -87.60915 41.844...
    1         36  ...  POLYGON ((-87.59215 41.81693, -87.59231 41.816...
    2         37  ...  POLYGON ((-87.62880 41.80189, -87.62879 41.801...
    3         38  ...  POLYGON ((-87.60671 41.81681, -87.60670 41.816...
    4         39  ...  POLYGON ((-87.59215 41.81693, -87.59215 41.816...
    [5 rows x 87 columns]

    >>> groceries.head()  # doctest: +SKIP
       OBJECTID     Ycoord  ...  Category                           geometry
    0        16  41.973266  ...       NaN  MULTIPOINT ((-87.65661 41.97321))
    1        18  41.696367  ...       NaN  MULTIPOINT ((-87.68136 41.69713))
    2        22  41.868634  ...       NaN  MULTIPOINT ((-87.63918 41.86847))
    3        23  41.877590  ...       new  MULTIPOINT ((-87.65495 41.87783))
    4        27  41.737696  ...       NaN  MULTIPOINT ((-87.62715 41.73623))
    [5 rows x 8 columns]

    >>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago)
    >>> groceries_w_communities[["Chain", "community", "geometry"]].head(2)
                   Chain    community                                geometry
    0     VIET HOA PLAZA       UPTOWN   MULTIPOINT ((1168268.672 1933554.35))
    1  COUNTY FAIR FOODS  MORGAN PARK  MULTIPOINT ((1162302.618 1832900.224))

    To include the distances:

    >>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago, \
distance_col="distances")
    >>> groceries_w_communities[["Chain", "community", \
"distances"]].head(2)
                   Chain    community  distances
    0     VIET HOA PLAZA       UPTOWN        0.0
    1  COUNTY FAIR FOODS  MORGAN PARK        0.0

    In the following example, we get multiple groceries for Uptown because all
    results are equidistant (in this case zero because they intersect).
    In fact, we get 4 results in total:

    >>> chicago_w_groceries = geopandas.sjoin_nearest(groceries, chicago, \
distance_col="distances", how="right")
    >>> uptown_results = \
chicago_w_groceries[chicago_w_groceries["community"] == "UPTOWN"]
    >>> uptown_results[["Chain", "community"]]
                 Chain community
    30  VIET HOA PLAZA    UPTOWN
    30      JEWEL OSCO    UPTOWN
    30          TARGET    UPTOWN
    30       Mariano's    UPTOWN

    See also
    --------
    sjoin : binary predicate joins
    GeoDataFrame.sjoin_nearest : equivalent method

    Notes
    -----
    Since this join relies on distances, results will be inaccurate
    if your geometries are in a geographic CRS.

    Every operation in GeoPandas is planar, i.e. the potential third
    dimension is not taken into account.
    """
    _basic_checks(left_df, right_df, how, lsuffix, rsuffix)

    left_df.geometry.values.check_geographic_crs(stacklevel=1)
    right_df.geometry.values.check_geographic_crs(stacklevel=1)

    return_distance = distance_col is not None

    indices, distances = _nearest_query(
        left_df,
        right_df,
        max_distance,
        how,
        return_distance,
        exclusive,
    )
    joined, distances = _frame_join(
        left_df,
        right_df,
        indices,
        distances,
        how,
        lsuffix,
        rsuffix,
        None,
    )

    if return_distance:
        joined[distance_col] = distances

    return joined
@@ -0,0 +1,484 @@
"""Tests for the clip module."""

import numpy as np
import pandas as pd

import shapely
from shapely.geometry import (
    GeometryCollection,
    LinearRing,
    LineString,
    MultiPoint,
    Point,
    Polygon,
    box,
)

import geopandas
from geopandas import GeoDataFrame, GeoSeries, clip
from geopandas._compat import HAS_PYPROJ
from geopandas.tools.clip import _mask_is_list_like_rectangle

import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from pandas.testing import assert_index_equal

mask_variants_single_rectangle = [
    "single_rectangle_gdf",
    "single_rectangle_gdf_list_bounds",
    "single_rectangle_gdf_tuple_bounds",
    "single_rectangle_gdf_array_bounds",
]
mask_variants_large_rectangle = [
    "larger_single_rectangle_gdf",
    "larger_single_rectangle_gdf_bounds",
]


@pytest.fixture
def point_gdf():
    """Create a point GeoDataFrame."""
    pts = np.array([[2, 2], [3, 4], [9, 8], [-12, -15]])
    gdf = GeoDataFrame([Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857")
    return gdf


@pytest.fixture
def point_gdf2():
    """Create a point GeoDataFrame."""
    pts = np.array([[5, 5], [2, 2], [4, 4], [0, 0], [3, 3], [1, 1]])
    gdf = GeoDataFrame([Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857")
    return gdf


@pytest.fixture
def pointsoutside_nooverlap_gdf():
    """Create a point GeoDataFrame. Its points are all outside the single
    rectangle, and its bounds are outside the single rectangle's."""
    pts = np.array([[5, 15], [15, 15], [15, 20]])
    gdf = GeoDataFrame([Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857")
    return gdf


@pytest.fixture
def pointsoutside_overlap_gdf():
    """Create a point GeoDataFrame. Its points are all outside the single
    rectangle, and its bounds are overlapping the single rectangle's."""
    pts = np.array([[5, 15], [15, 15], [15, 5]])
    gdf = GeoDataFrame([Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857")
    return gdf


@pytest.fixture
def single_rectangle_gdf():
    """Create a single rectangle for clipping."""
    poly_inters = Polygon([(0, 0), (0, 10), (10, 10), (10, 0), (0, 0)])
    gdf = GeoDataFrame([1], geometry=[poly_inters], crs="EPSG:3857")
    gdf["attr2"] = "site-boundary"
    return gdf


@pytest.fixture
def single_rectangle_gdf_tuple_bounds(single_rectangle_gdf):
    """Bounds of the created single rectangle"""
    return tuple(single_rectangle_gdf.total_bounds)


@pytest.fixture
def single_rectangle_gdf_list_bounds(single_rectangle_gdf):
    """Bounds of the created single rectangle"""
    return list(single_rectangle_gdf.total_bounds)


@pytest.fixture
def single_rectangle_gdf_array_bounds(single_rectangle_gdf):
    """Bounds of the created single rectangle"""
    return single_rectangle_gdf.total_bounds


@pytest.fixture
def larger_single_rectangle_gdf():
    """Create a slightly larger rectangle for clipping.
    The smaller single rectangle is used to test the edge case where slivers
    are returned when you clip polygons. This fixture is larger which
    eliminates the slivers in the clip return.
    """
    poly_inters = Polygon([(-5, -5), (-5, 15), (15, 15), (15, -5), (-5, -5)])
    gdf = GeoDataFrame([1], geometry=[poly_inters], crs="EPSG:3857")
    gdf["attr2"] = ["study area"]
    return gdf


@pytest.fixture
def larger_single_rectangle_gdf_bounds(larger_single_rectangle_gdf):
    """Bounds of the created single rectangle"""
    return tuple(larger_single_rectangle_gdf.total_bounds)


@pytest.fixture
def buffered_locations(point_gdf):
    """Buffer points to create a multi-polygon."""
    buffered_locs = point_gdf
    buffered_locs["geometry"] = buffered_locs.buffer(4)
    buffered_locs["type"] = "plot"
    return buffered_locs


@pytest.fixture
def donut_geometry(buffered_locations, single_rectangle_gdf):
    """Make a geometry with a hole in the middle (a donut)."""
    donut = geopandas.overlay(
        buffered_locations, single_rectangle_gdf, how="symmetric_difference"
    )
    return donut


@pytest.fixture
def two_line_gdf():
    """Create line objects for testing."""
    linea = LineString([(1, 1), (2, 2), (3, 2), (5, 3)])
    lineb = LineString([(3, 4), (5, 7), (12, 2), (10, 5), (9, 7.5)])
    gdf = GeoDataFrame([1, 2], geometry=[linea, lineb], crs="EPSG:3857")
    return gdf


@pytest.fixture
def multi_poly_gdf(donut_geometry):
    """Create a multi-polygon GeoDataFrame."""
    multi_poly = donut_geometry.union_all()
    out_df = GeoDataFrame(geometry=GeoSeries(multi_poly), crs="EPSG:3857")
    out_df["attr"] = ["pool"]
    return out_df


@pytest.fixture
def multi_line(two_line_gdf):
    """Create a multi-line GeoDataFrame.
    This GDF has one multiline and one regular line."""
    # Create a single and multi line object
    multiline_feat = two_line_gdf.union_all()
    linec = LineString([(2, 1), (3, 1), (4, 1), (5, 2)])
    out_df = GeoDataFrame(geometry=GeoSeries([multiline_feat, linec]), crs="EPSG:3857")
    out_df["attr"] = ["road", "stream"]
    return out_df


@pytest.fixture
def multi_point(point_gdf):
    """Create a multi-point GeoDataFrame."""
    multi_point = point_gdf.union_all()
    out_df = GeoDataFrame(
        geometry=GeoSeries(
            [multi_point, Point(2, 5), Point(-11, -14), Point(-10, -12)]
        ),
        crs="EPSG:3857",
    )
    out_df["attr"] = ["tree", "another tree", "shrub", "berries"]
    return out_df


@pytest.fixture
def mixed_gdf():
    """Create a mixed point, polygon, line, and ring GeoDataFrame for testing."""
    point = Point(2, 3)
    line = LineString([(1, 1), (2, 2), (3, 2), (5, 3), (12, 1)])
    poly = Polygon([(3, 4), (5, 2), (12, 2), (10, 5), (9, 7.5)])
    ring = LinearRing([(1, 1), (2, 2), (3, 2), (5, 3), (12, 1)])
    gdf = GeoDataFrame(
        [1, 2, 3, 4], geometry=[point, poly, line, ring], crs="EPSG:3857"
    )
    return gdf


@pytest.fixture
def geomcol_gdf():
    """Create a GeometryCollection (point and polygon) GeoDataFrame for testing."""
    point = Point(2, 3)
    poly = Polygon([(3, 4), (5, 2), (12, 2), (10, 5), (9, 7.5)])
    coll = GeometryCollection([point, poly])
    gdf = GeoDataFrame([1], geometry=[coll], crs="EPSG:3857")
    return gdf


@pytest.fixture
def sliver_line():
    """Create a line that will create a point when clipped."""
    linea = LineString([(10, 5), (13, 5), (15, 5)])
    lineb = LineString([(1, 1), (2, 2), (3, 2), (5, 3), (12, 1)])
    gdf = GeoDataFrame([1, 2], geometry=[linea, lineb], crs="EPSG:3857")
    return gdf

def test_not_gdf(single_rectangle_gdf):
    """Non-GeoDataFrame inputs raise a TypeError."""
    with pytest.raises(TypeError):
        clip((2, 3), single_rectangle_gdf)
    with pytest.raises(TypeError):
        clip(single_rectangle_gdf, "foobar")
    with pytest.raises(TypeError):
        clip(single_rectangle_gdf, (1, 2, 3))
    with pytest.raises(TypeError):
        clip(single_rectangle_gdf, (1, 2, 3, 4, 5))


def test_non_overlapping_geoms():
    """Test that a bounding box returns empty if the extents don't overlap"""
    unit_box = Polygon([(0, 0), (0, 1), (1, 1), (1, 0), (0, 0)])
    unit_gdf = GeoDataFrame([1], geometry=[unit_box], crs="EPSG:3857")
    non_overlapping_gdf = unit_gdf.copy()
    non_overlapping_gdf = non_overlapping_gdf.geometry.apply(
        lambda x: shapely.affinity.translate(x, xoff=20)
    )
    out = clip(unit_gdf, non_overlapping_gdf)
    assert_geodataframe_equal(out, unit_gdf.iloc[:0])
    out2 = clip(unit_gdf.geometry, non_overlapping_gdf)
    assert_geoseries_equal(out2, GeoSeries(crs=unit_gdf.crs))

@pytest.mark.parametrize("mask_fixture_name", mask_variants_single_rectangle)
|
||||
class TestClipWithSingleRectangleGdf:
|
||||
@pytest.fixture
|
||||
def mask(self, mask_fixture_name, request):
|
||||
return request.getfixturevalue(mask_fixture_name)
|
||||
|
||||
def test_returns_gdf(self, point_gdf, mask):
|
||||
"""Test that function returns a GeoDataFrame (or GDF-like) object."""
|
||||
out = clip(point_gdf, mask)
|
||||
assert isinstance(out, GeoDataFrame)
|
||||
|
||||
def test_returns_series(self, point_gdf, mask):
|
||||
"""Test that function returns a GeoSeries if GeoSeries is passed."""
|
||||
out = clip(point_gdf.geometry, mask)
|
||||
assert isinstance(out, GeoSeries)
|
||||
|
||||
def test_clip_points(self, point_gdf, mask):
|
||||
"""Test clipping a points GDF with a generic polygon geometry."""
|
||||
clip_pts = clip(point_gdf, mask)
|
||||
pts = np.array([[2, 2], [3, 4], [9, 8]])
|
||||
exp = GeoDataFrame(
|
||||
[Point(xy) for xy in pts], columns=["geometry"], crs="EPSG:3857"
|
||||
)
|
||||
assert_geodataframe_equal(clip_pts, exp)
|
||||
|
||||
def test_clip_points_geom_col_rename(self, point_gdf, mask):
|
||||
"""Test clipping a points GDF with a generic polygon geometry."""
|
||||
point_gdf_geom_col_rename = point_gdf.rename_geometry("geometry2")
|
||||
clip_pts = clip(point_gdf_geom_col_rename, mask)
|
||||
pts = np.array([[2, 2], [3, 4], [9, 8]])
|
||||
exp = GeoDataFrame(
|
||||
[Point(xy) for xy in pts],
|
||||
columns=["geometry2"],
|
||||
crs="EPSG:3857",
|
||||
geometry="geometry2",
|
||||
)
|
||||
assert_geodataframe_equal(clip_pts, exp)
|
||||
|
||||
def test_clip_poly(self, buffered_locations, mask):
|
||||
"""Test clipping a polygon GDF with a generic polygon geometry."""
|
||||
clipped_poly = clip(buffered_locations, mask)
|
||||
assert len(clipped_poly.geometry) == 3
|
||||
assert all(clipped_poly.geom_type == "Polygon")
|
||||
|
||||
def test_clip_poly_geom_col_rename(self, buffered_locations, mask):
|
||||
"""Test clipping a polygon GDF with a generic polygon geometry."""
|
||||
|
||||
poly_gdf_geom_col_rename = buffered_locations.rename_geometry("geometry2")
|
||||
clipped_poly = clip(poly_gdf_geom_col_rename, mask)
|
||||
assert len(clipped_poly.geometry) == 3
|
||||
assert "geometry" not in clipped_poly.keys()
|
||||
assert "geometry2" in clipped_poly.keys()
|
||||
|
||||
def test_clip_poly_series(self, buffered_locations, mask):
|
||||
"""Test clipping a polygon GDF with a generic polygon geometry."""
|
||||
clipped_poly = clip(buffered_locations.geometry, mask)
|
||||
assert len(clipped_poly) == 3
|
||||
assert all(clipped_poly.geom_type == "Polygon")
|
||||
|
||||
def test_clip_multipoly_keep_geom_type(self, multi_poly_gdf, mask):
|
||||
"""Test a multi poly object where the return includes a sliver.
|
||||
Also the bounds of the object should == the bounds of the clip object
|
||||
if they fully overlap (as they do in these fixtures)."""
|
||||
clipped = clip(multi_poly_gdf, mask, keep_geom_type=True)
|
||||
expected_bounds = (
|
||||
mask if _mask_is_list_like_rectangle(mask) else mask.total_bounds
|
||||
)
|
||||
assert np.array_equal(clipped.total_bounds, expected_bounds)
|
||||
# Assert returned data is a not geometry collection
|
||||
assert (clipped.geom_type.isin(["Polygon", "MultiPolygon"])).all()
|
||||
|
||||
def test_clip_multiline(self, multi_line, mask):
|
||||
"""Test that clipping a multiline feature with a poly returns expected
|
||||
output."""
|
||||
clipped = clip(multi_line, mask)
|
||||
assert clipped.geom_type[0] == "MultiLineString"
|
||||
|
||||
def test_clip_multipoint(self, multi_point, mask):
|
||||
"""Clipping a multipoint feature with a polygon works as expected.
|
||||
should return a geodataframe with a single multi point feature"""
|
||||
clipped = clip(multi_point, mask)
|
||||
assert clipped.geom_type[0] == "MultiPoint"
|
||||
assert hasattr(clipped, "attr")
|
||||
# All points should intersect the clip geom
|
||||
assert len(clipped) == 2
|
||||
clipped_mutltipoint = MultiPoint(
|
||||
[
|
||||
Point(2, 2),
|
||||
Point(3, 4),
|
||||
Point(9, 8),
|
||||
]
|
||||
)
|
||||
assert clipped.iloc[0].geometry.wkt == clipped_mutltipoint.wkt
|
||||
shape_for_points = (
|
||||
box(*mask) if _mask_is_list_like_rectangle(mask) else mask.union_all()
|
||||
)
|
||||
assert all(clipped.intersects(shape_for_points))
|
||||
|
||||
def test_clip_lines(self, two_line_gdf, mask):
|
||||
"""Test what happens when you give the clip_extent a line GDF."""
|
||||
clip_line = clip(two_line_gdf, mask)
|
||||
assert len(clip_line.geometry) == 2
|
||||
|
||||
def test_mixed_geom(self, mixed_gdf, mask):
|
||||
"""Test clipping a mixed GeoDataFrame"""
|
||||
clipped = clip(mixed_gdf, mask)
|
||||
assert (
|
||||
clipped.geom_type[0] == "Point"
|
||||
and clipped.geom_type[1] == "Polygon"
|
||||
and clipped.geom_type[2] == "LineString"
|
||||
)
|
||||
|
||||
def test_mixed_series(self, mixed_gdf, mask):
|
||||
"""Test clipping a mixed GeoSeries"""
|
||||
clipped = clip(mixed_gdf.geometry, mask)
|
||||
assert (
|
||||
clipped.geom_type[0] == "Point"
|
||||
and clipped.geom_type[1] == "Polygon"
|
||||
and clipped.geom_type[2] == "LineString"
|
||||
)
|
||||
|
||||
def test_clip_with_line_extra_geom(self, sliver_line, mask):
|
||||
"""When the output of a clipped line returns a geom collection,
|
||||
and keep_geom_type is True, no geometry collections should be returned."""
|
||||
clipped = clip(sliver_line, mask, keep_geom_type=True)
|
||||
assert len(clipped.geometry) == 1
|
||||
# Assert returned data is a not geometry collection
|
||||
assert not (clipped.geom_type == "GeometryCollection").any()
|
||||
|
||||
def test_clip_no_box_overlap(self, pointsoutside_nooverlap_gdf, mask):
|
||||
"""Test clip when intersection is empty and boxes do not overlap."""
|
||||
clipped = clip(pointsoutside_nooverlap_gdf, mask)
|
||||
assert len(clipped) == 0
|
||||
|
||||
def test_clip_box_overlap(self, pointsoutside_overlap_gdf, mask):
|
||||
"""Test clip when intersection is empty and boxes do overlap."""
|
||||
clipped = clip(pointsoutside_overlap_gdf, mask)
|
||||
assert len(clipped) == 0
|
||||
|
||||
def test_warning_extra_geoms_mixed(self, mixed_gdf, mask):
|
||||
"""Test the correct warnings are raised if keep_geom_type is
|
||||
called on a mixed GDF"""
|
||||
with pytest.warns(UserWarning):
|
||||
clip(mixed_gdf, mask, keep_geom_type=True)
|
||||
|
||||
def test_warning_geomcoll(self, geomcol_gdf, mask):
|
||||
"""Test the correct warnings are raised if keep_geom_type is
|
||||
called on a GDF with GeometryCollection"""
|
||||
with pytest.warns(UserWarning):
|
||||
clip(geomcol_gdf, mask, keep_geom_type=True)
|
||||
|
||||
|
||||
def test_clip_line_keep_slivers(sliver_line, single_rectangle_gdf):
    """Test that a point sliver is kept when clipping a line-only
    geometry type."""
    clipped = clip(sliver_line, single_rectangle_gdf)
    # The point sliver is kept alongside the line geometry
    assert "Point" == clipped.geom_type[0]
    assert "LineString" == clipped.geom_type[1]


def test_clip_multipoly_keep_slivers(multi_poly_gdf, single_rectangle_gdf):
    """Test a multi poly object where the return includes a sliver.
    Also the bounds of the object should == the bounds of the clip object
    if they fully overlap (as they do in these fixtures)."""
    clipped = clip(multi_poly_gdf, single_rectangle_gdf)
    assert np.array_equal(clipped.total_bounds, single_rectangle_gdf.total_bounds)
    # Assert returned data is a geometry collection given sliver geoms
    assert "GeometryCollection" in clipped.geom_type[0]


@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not available")
def test_warning_crs_mismatch(point_gdf, single_rectangle_gdf):
    with pytest.warns(UserWarning, match="CRS mismatch between the CRS"):
        clip(point_gdf, single_rectangle_gdf.to_crs(4326))


def test_clip_with_polygon(single_rectangle_gdf):
    """Test clip when using a shapely object"""
    polygon = Polygon([(0, 0), (5, 12), (10, 0), (0, 0)])
    clipped = clip(single_rectangle_gdf, polygon)
    exp_poly = polygon.intersection(
        Polygon([(0, 0), (0, 10), (10, 10), (10, 0), (0, 0)])
    )
    exp = GeoDataFrame([1], geometry=[exp_poly], crs="EPSG:3857")
    exp["attr2"] = "site-boundary"
    assert_geodataframe_equal(clipped, exp)


def test_clip_with_multipolygon(buffered_locations, single_rectangle_gdf):
    """Test clipping a polygon with a multipolygon."""
    multi = buffered_locations.dissolve(by="type").reset_index()
    clipped = clip(single_rectangle_gdf, multi)
    assert clipped.geom_type[0] == "Polygon"


@pytest.mark.parametrize(
    "mask_fixture_name",
    mask_variants_large_rectangle,
)
def test_clip_single_multipoly_no_extra_geoms(
    buffered_locations, mask_fixture_name, request
):
    """When clipping a multi-polygon feature, no additional geom types
    should be returned."""
    masks = request.getfixturevalue(mask_fixture_name)
    multi = buffered_locations.dissolve(by="type").reset_index()
    clipped = clip(multi, masks)
    assert clipped.geom_type[0] == "Polygon"


@pytest.mark.filterwarnings("ignore:All-NaN slice encountered")
@pytest.mark.parametrize(
    "mask",
    [
        Polygon(),
        (np.nan,) * 4,
        (np.nan, 0, np.nan, 1),
        GeoSeries([Polygon(), Polygon()], crs="EPSG:3857"),
        GeoSeries([Polygon(), Polygon()], crs="EPSG:3857").to_frame(),
        GeoSeries([], crs="EPSG:3857"),
        GeoSeries([], crs="EPSG:3857").to_frame(),
    ],
)
def test_clip_empty_mask(buffered_locations, mask):
    """Test that clipping with empty mask returns an empty result."""
    clipped = clip(buffered_locations, mask)
    assert_geodataframe_equal(
        clipped,
        GeoDataFrame([], columns=["geometry", "type"], crs="EPSG:3857"),
        check_index_type=False,
    )
    clipped = clip(buffered_locations.geometry, mask)
    assert_geoseries_equal(clipped, GeoSeries([], crs="EPSG:3857"))


def test_clip_sorting(point_gdf2):
    """Test the sort kwarg in clip"""
    bbox = shapely.geometry.box(0, 0, 2, 2)
    unsorted_clipped_gdf = point_gdf2.clip(bbox)
    sorted_clipped_gdf = point_gdf2.clip(bbox, sort=True)

    expected_sorted_index = pd.Index([1, 3, 5])

    assert not (sorted(unsorted_clipped_gdf.index) == unsorted_clipped_gdf.index).all()
    assert (sorted(sorted_clipped_gdf.index) == sorted_clipped_gdf.index).all()
    assert_index_equal(expected_sorted_index, sorted_clipped_gdf.index)
@@ -0,0 +1,76 @@
import numpy as np

from shapely.geometry import Point
from shapely.wkt import loads

import geopandas

import pytest
from pandas.testing import assert_series_equal


def test_hilbert_distance():
    # test the actual Hilbert Code algorithm against some hardcoded values
    geoms = geopandas.GeoSeries.from_wkt(
        [
            "POINT (0 0)",
            "POINT (1 1)",
            "POINT (1 0)",
            "POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))",
        ]
    )
    result = geoms.hilbert_distance(total_bounds=(0, 0, 1, 1), level=2)
    assert result.tolist() == [0, 10, 15, 2]

    result = geoms.hilbert_distance(total_bounds=(0, 0, 1, 1), level=3)
    assert result.tolist() == [0, 42, 63, 10]

    result = geoms.hilbert_distance(total_bounds=(0, 0, 1, 1), level=16)
    assert result.tolist() == [0, 2863311530, 4294967295, 715827882]

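
# Note (not part of this commit): with total_bounds (0, 0, 1, 1) and level p,
# the Hilbert curve above visits a 2**p x 2**p grid, so distances fall in
# [0, 4**p - 1]: level 2 -> 0..15, level 3 -> 0..63, and
# level 16 -> 0..4294967295, consistent with the hardcoded values.
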
@pytest.fixture
def geoseries_points():
    p1 = Point(1, 2)
    p2 = Point(2, 3)
    p3 = Point(3, 4)
    p4 = Point(4, 1)
    return geopandas.GeoSeries([p1, p2, p3, p4])


def test_hilbert_distance_level(geoseries_points):
    with pytest.raises(ValueError):
        geoseries_points.hilbert_distance(level=20)


def test_specified_total_bounds(geoseries_points):
    result = geoseries_points.hilbert_distance(
        total_bounds=geoseries_points.total_bounds
    )
    expected = geoseries_points.hilbert_distance()
    assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "empty",
    [
        None,
        loads("POLYGON EMPTY"),
    ],
)
def test_empty(geoseries_points, empty):
    s = geoseries_points
    s.iloc[-1] = empty
    with pytest.raises(
        ValueError, match="cannot be computed on a GeoSeries with empty"
    ):
        s.hilbert_distance()


def test_zero_width():
    # special case of all points on the same line -> avoid warnings because
    # of division by 0 and introducing NaN
    s = geopandas.GeoSeries([Point(0, 0), Point(0, 2), Point(0, 1)])
    with np.errstate(all="raise"):
        result = s.hilbert_distance()
    assert np.array(result).argsort().tolist() == [0, 2, 1]
@@ -0,0 +1,67 @@
import numpy

import geopandas
from geopandas.tools._random import uniform

import pytest


@pytest.fixture
def multipolygons(nybb_filename):
    return geopandas.read_file(nybb_filename).geometry


@pytest.fixture
def polygons(multipolygons):
    return multipolygons.explode(ignore_index=True).geometry


@pytest.fixture
def multilinestrings(multipolygons):
    return multipolygons.boundary


@pytest.fixture
def linestrings(polygons):
    return polygons.boundary


@pytest.fixture
def points(multipolygons):
    return multipolygons.centroid


@pytest.mark.parametrize("size", [10, 100])
@pytest.mark.parametrize(
    "geom_fixture", ["multipolygons", "polygons", "multilinestrings", "linestrings"]
)
def test_uniform(geom_fixture, size, request):
    geom = request.getfixturevalue(geom_fixture)[0]
    sample = uniform(geom, size=size, rng=1)
    sample_series = (
        geopandas.GeoSeries(sample).explode(index_parts=True).reset_index(drop=True)
    )
    assert len(sample_series) == size
    sample_in_geom = sample_series.buffer(0.00000001).sindex.query(
        geom, predicate="intersects"
    )
    assert len(sample_in_geom) == size


def test_uniform_unsupported(points):
    with pytest.warns(UserWarning, match="Sampling is not supported"):
        sample = uniform(points[0], size=10, rng=1)
    assert sample.is_empty


def test_uniform_generator(polygons):
    sample = uniform(polygons[0], size=10, rng=1)
    sample2 = uniform(polygons[0], size=10, rng=1)
    assert sample.equals(sample2)

    generator = numpy.random.default_rng(seed=1)
    gen_sample = uniform(polygons[0], size=10, rng=generator)
    gen_sample2 = uniform(polygons[0], size=10, rng=generator)

    assert sample.equals(gen_sample)
    assert not sample.equals(gen_sample2)
File diff suppressed because it is too large
@@ -0,0 +1,51 @@
from shapely.geometry import LineString, MultiPoint, Point

from geopandas import GeoSeries
from geopandas.tools import collect

import pytest


class TestTools:
    def setup_method(self):
        self.p1 = Point(0, 0)
        self.p2 = Point(1, 1)
        self.p3 = Point(2, 2)
        self.mpc = MultiPoint([self.p1, self.p2, self.p3])

        self.mp1 = MultiPoint([self.p1, self.p2])
        self.line1 = LineString([(3, 3), (4, 4)])

    def test_collect_single(self):
        result = collect(self.p1)
        assert self.p1.equals(result)

    def test_collect_single_force_multi(self):
        result = collect(self.p1, multi=True)
        expected = MultiPoint([self.p1])
        assert expected.equals(result)

    def test_collect_multi(self):
        result = collect(self.mp1)
        assert self.mp1.equals(result)

    def test_collect_multi_force_multi(self):
        result = collect(self.mp1, multi=True)
        assert self.mp1.equals(result)

    def test_collect_list(self):
        result = collect([self.p1, self.p2, self.p3])
        assert self.mpc.equals(result)

    def test_collect_GeoSeries(self):
        s = GeoSeries([self.p1, self.p2, self.p3])
        result = collect(s)
        assert self.mpc.equals(result)

    def test_collect_mixed_types(self):
        with pytest.raises(ValueError):
            collect([self.p1, self.line1])

    def test_collect_mixed_multi(self):
        with pytest.raises(ValueError):
            collect([self.mpc, self.mp1])
@@ -0,0 +1,45 @@
import pandas as pd

from shapely.geometry import MultiLineString, MultiPoint, MultiPolygon
from shapely.geometry.base import BaseGeometry

_multi_type_map = {
    "Point": MultiPoint,
    "LineString": MultiLineString,
    "Polygon": MultiPolygon,
}


def collect(x, multi=False):
    """
    Collect single part geometries into their Multi* counterpart.

    Parameters
    ----------
    x : an iterable or Series of Shapely geometries, a GeoSeries, or
        a single Shapely geometry
    multi : boolean, default False
        if True, force returned geometries to be Multi* even if they
        only have one component.

    """
    if isinstance(x, BaseGeometry):
        x = [x]
    elif isinstance(x, pd.Series):
        x = list(x)

    # We cannot create a GeometryCollection here, so all geometry types must
    # be the same. If there is more than one element, none of them can be
    # Multi*: a mix of Point and MultiPoint is rejected, and so is a list of
    # several MultiPoints.
    t = x[0].geom_type
    if not all(g.geom_type == t for g in x):
        raise ValueError("Geometry type must be homogeneous")
    if len(x) > 1 and t.startswith("Multi"):
        raise ValueError("Cannot collect {0}. Must have single geometries".format(t))

    if len(x) == 1 and (t.startswith("Multi") or not multi):
        # If there's only one geometry and we're not forcing to multi,
        # or it already is a Multi* geometry, just return it
        return x[0]
    return _multi_type_map[t](x)

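
# Usage sketch (not part of this commit):
#
#   from shapely.geometry import Point
#   collect([Point(0, 0), Point(1, 1)])   # MultiPoint of two points
#   collect(Point(0, 0))                  # the Point itself, unchanged
#   collect(Point(0, 0), multi=True)      # MultiPoint with one component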