refactor: excel parse

This commit is contained in:
Blizzard
2026-04-16 10:01:11 +08:00
parent 680ecc320f
commit f62f95ec02
7941 changed files with 2899112 additions and 0 deletions
@@ -0,0 +1,29 @@
from geopandas._config import options
from geopandas.geoseries import GeoSeries
from geopandas.geodataframe import GeoDataFrame
from geopandas.array import points_from_xy
from geopandas.io.file import _read_file as read_file
from geopandas.io.file import _list_layers as list_layers
from geopandas.io.arrow import _read_parquet as read_parquet
from geopandas.io.arrow import _read_feather as read_feather
from geopandas.io.sql import _read_postgis as read_postgis
from geopandas.tools import sjoin, sjoin_nearest
from geopandas.tools import overlay
from geopandas.tools._show_versions import show_versions
from geopandas.tools import clip
import geopandas.datasets
# make the interactive namespace easier to use
# for `from geopandas import *` demos.
import geopandas as gpd
import pandas as pd
import numpy as np
from . import _version
__version__ = _version.get_versions()["version"]
@@ -0,0 +1,92 @@
import importlib
from packaging.version import Version
import pandas as pd
import shapely
import shapely.geos
# -----------------------------------------------------------------------------
# pandas compat
# -----------------------------------------------------------------------------
# Each flag is True when the installed pandas version is at least the release
# named in the flag. The "rc0" / "dev0" suffixes make pre-release builds of
# that version satisfy the check as well.
PANDAS_GE_14 = Version(pd.__version__) >= Version("1.4.0rc0")
PANDAS_GE_15 = Version(pd.__version__) >= Version("1.5.0")
PANDAS_GE_20 = Version(pd.__version__) >= Version("2.0.0")
PANDAS_GE_202 = Version(pd.__version__) >= Version("2.0.2")
PANDAS_GE_21 = Version(pd.__version__) >= Version("2.1.0")
PANDAS_GE_22 = Version(pd.__version__) >= Version("2.2.0")
PANDAS_GE_30 = Version(pd.__version__) >= Version("3.0.0.dev0")
# -----------------------------------------------------------------------------
# Shapely / GEOS compat
# -----------------------------------------------------------------------------
# Shapely is compared as a parsed version; the GEOS version is compared as a
# tuple against (major, minor, patch).
SHAPELY_GE_204 = Version(shapely.__version__) >= Version("2.0.4")
GEOS_GE_390 = shapely.geos.geos_version >= (3, 9, 0)
GEOS_GE_310 = shapely.geos.geos_version >= (3, 10, 0)
def import_optional_dependency(name: str, extra: str = ""):
    """
    Import an optional dependency.

    Adapted from pandas.compat._optional::import_optional_dependency

    Raises a formatted ImportError if the module is not present.

    Parameters
    ----------
    name : str
        The module name.
    extra : str
        Additional text to include in the ImportError message.

    Returns
    -------
    module
        The imported module.

    Raises
    ------
    ValueError
        If ``name`` is not a string.
    ImportError
        If the module cannot be imported.
    """
    if not isinstance(name, str):
        raise ValueError(
            "Invalid module name: '{name}'; must be a string".format(name=name)
        )

    try:
        return importlib.import_module(name)
    except ImportError:
        # Build the message from adjacent string literals. The previous
        # triple-quoted literal embedded stray quote characters and a newline
        # in the text shown to the user.
        msg = (
            f"Missing optional dependency '{name}'. {extra} "
            f"Use pip or conda to install {name}."
        )
        # ``from None`` hides the low-level import traceback.
        raise ImportError(msg) from None
# -----------------------------------------------------------------------------
# pyproj compat
# -----------------------------------------------------------------------------
# Probe for pyproj once at import time and record the outcome in HAS_PYPROJ.
try:
    import pyproj  # noqa: F401

    HAS_PYPROJ = True

except ImportError as err:
    HAS_PYPROJ = False
    # Store the message as a string: ``err`` is unbound once the except block
    # exits, and requires_pyproj reports this text later.
    pyproj_import_error = str(err)
def requires_pyproj(func):
    """
    Decorate ``func`` so that calling it without pyproj raises ImportError.

    The check happens at call time, using the module-level ``HAS_PYPROJ``
    flag. The wrapper keeps the name, docstring and other metadata of
    ``func``.

    Parameters
    ----------
    func : callable
        The function that needs pyproj.

    Returns
    -------
    callable
        A wrapper that forwards all arguments to ``func``.
    """
    # Local import keeps this block self-contained.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if not HAS_PYPROJ:
            raise ImportError(
                f"The 'pyproj' package is required for {func.__name__} to work. "
                "Install it and initialize the object with a CRS before using it."
                f"\nImporting pyproj resulted in: {pyproj_import_error}"
            )
        return func(*args, **kwargs)

    return wrapper
@@ -0,0 +1,133 @@
"""
Lightweight options machinery.
Based on https://github.com/topper-123/optioneer, but simplified (it does not
deal with nested options, deprecated options, etc.): just an attribute-style,
dict-like object that holds the options and provides a nice repr.
"""
import textwrap
import warnings
from collections import namedtuple
# Definition of one option: its key, default value, description, an optional
# validator called with the new value, and an optional callback called with
# (key, value) after the value has been stored.
Option = namedtuple("Option", "key default_value doc validator callback")


class Options(object):
    """Provide attribute-style access to configuration dict.

    Parameters
    ----------
    options : dict
        Mapping of option name to ``Option``. Every option starts out at its
        ``default_value``.

    Notes
    -----
    Only options passed at construction can be set. Assigning any other
    attribute raises ``AttributeError``, and so does reading an unknown one.
    """

    def __init__(self, options):
        # Go through object.__setattr__ because our own __setattr__ rejects
        # every key that is not an option.
        super().__setattr__("_options", options)
        # populate with default values
        config = {key: option.default_value for key, option in options.items()}
        super().__setattr__("_config", config)

    def __setattr__(self, key, value):
        # you can't set new keys
        if key not in self._config:
            msg = "You can only set the value of existing options"
            raise AttributeError(msg)
        option = self._options[key]
        # The validator runs before the value is stored, so an exception it
        # raises leaves the old value in place.
        if option.validator:
            option.validator(value)
        self._config[key] = value
        if option.callback:
            option.callback(key, value)

    def __getattr__(self, key):
        # Only reached when normal attribute lookup fails. Read the config via
        # ``__dict__`` so that an instance without ``_config`` (e.g. one being
        # rebuilt by ``copy`` or ``pickle``) raises AttributeError instead of
        # recursing into this method forever.
        try:
            return self.__dict__["_config"][key]
        except KeyError:
            raise AttributeError("No such option") from None

    def __dir__(self):
        return list(self._config.keys())

    def __repr__(self):
        cls = self.__class__.__name__
        parts = []
        for key, option in self._options.items():
            parts.append(
                "{key}: {cur!r} [default: {default!r}]\n".format(
                    key=key, cur=self._config[key], default=option.default_value
                )
            )
            if option.doc:
                doc_text = "\n".join(textwrap.wrap(option.doc, width=70))
            else:
                doc_text = "No description available."
            parts.append(textwrap.indent(doc_text, prefix=" ") + "\n")
        space = "\n "
        # Indent every line of the listing inside the "ClassName(...)" wrapper.
        description = "".join(parts).replace("\n", space)
        return "{}({}{})".format(cls, space, description)
def _validate_display_precision(value):
if value is not None:
if not isinstance(value, int) or not (0 <= value <= 16):
raise ValueError("Invalid value, needs to be an integer [0-16]")
# Option definition for ``display_precision``. It defaults to None and is
# validated by _validate_display_precision on assignment.
display_precision = Option(
    key="display_precision",
    default_value=None,
    doc=(
        "The precision (maximum number of decimals) of the coordinates in "
        "the WKT representation in the Series/DataFrame display. "
        "By default (None), it tries to infer and use 3 decimals for projected "
        "coordinates and 5 decimals for geographic coordinates."
    ),
    validator=_validate_display_precision,
    callback=None,
)
def _warn_use_pygeos_deprecated(_value):
warnings.warn(
"pygeos support was removed in 1.0. "
"geopandas.use_pygeos is a no-op and will be removed in geopandas 1.1.",
stacklevel=3,
)
def _validate_io_engine(value):
if value is not None:
if value not in ("pyogrio", "fiona"):
raise ValueError(f"Expected 'pyogrio' or 'fiona', got '{value}'")
# Option definition for ``io_engine``. It defaults to None and is validated by
# _validate_io_engine on assignment.
io_engine = Option(
    key="io_engine",
    default_value=None,
    doc=(
        "The default engine for ``read_file`` and ``to_file``. "
        "Options are 'pyogrio' and 'fiona'."
    ),
    validator=_validate_io_engine,
    callback=None,
)
# TODO: deprecate this
# The "validator" here only emits a warning; it never rejects a value.
use_pygeos = Option(
    key="use_pygeos",
    default_value=False,
    doc=(
        "Deprecated option previously used to enable PyGEOS. "
        "It will be removed in GeoPandas 1.1."
    ),
    validator=_warn_use_pygeos_deprecated,
    callback=None,
)
# The options object of this module, holding the three options defined above.
options = Options(
    {
        "display_precision": display_precision,
        "use_pygeos": use_pygeos,
        "io_engine": io_engine,
    }
)
@@ -0,0 +1,52 @@
from textwrap import dedent
from typing import Callable, Union
# doc decorator function ported with modifications from Pandas
# https://github.com/pandas-dev/pandas/blob/master/pandas/util/_decorators.py
def doc(*docstrings: Union[str, Callable], **params) -> Callable:
    """
    Build a docstring from templates and attach it to the decorated callable.

    The decorated callable's own docstring (dedented) comes first, followed by
    each entry of ``docstrings`` in order. String components are formatted
    with ``params``; callable components contribute their dedented ``__doc__``
    as-is.

    The list of components is stored on the decorated callable as
    ``_docstring_components``, so that passing it to another ``doc`` call
    reuses the original templates.

    Parameters
    ----------
    *docstrings : str or callable
        The string / docstring / docstring template to be appended in order
        after default docstring under callable.
    **params
        The string which would be used to format docstring template.
    """

    def decorator(decorated: Callable) -> Callable:
        # collecting docstring and docstring templates
        components: list[Union[str, Callable]] = []
        own_doc = decorated.__doc__
        if own_doc:
            components.append(dedent(own_doc))

        for template in docstrings:
            if hasattr(template, "_docstring_components"):
                # Already processed by ``doc``: reuse its raw templates.
                components += template._docstring_components
                continue
            if isinstance(template, str) or template.__doc__:
                components.append(template)

        # formatting templates and concatenating docstring
        rendered = []
        for component in components:
            if isinstance(component, str):
                rendered.append(component.format(**params))
            else:
                rendered.append(dedent(component.__doc__ or ""))
        decorated.__doc__ = "".join(rendered)

        decorated._docstring_components = components
        return decorated

    return decorator
@@ -0,0 +1,21 @@
# This file was generated by 'versioneer.py' (0.29) from
# revision-control system data, or from the parent directory name of an
# unpacked source archive. Distribution tarballs contain a pre-generated copy
# of this file.
import json
version_json = '''
{
"date": "2024-07-02T14:23:16+0200",
"dirty": false,
"error": null,
"full-revisionid": "747d66ee6fcf00b819c08f11ecded53736c4652b",
"version": "1.0.1"
}
''' # END VERSION_JSON
def get_versions():
    """Return the version metadata embedded in this file, parsed from JSON."""
    info = json.loads(version_json)
    return info
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,47 @@
import os.path
import geopandas
import pytest
from geopandas.tests.util import _NATURALEARTH_CITIES, _NATURALEARTH_LOWRES, _NYBB
@pytest.fixture(autouse=True)
def add_geopandas(doctest_namespace):
    """Expose the ``geopandas`` module under that name in the doctest namespace."""
    doctest_namespace["geopandas"] = geopandas
# Datasets used in our tests
@pytest.fixture(scope="session")
def naturalearth_lowres() -> str:
    """Return the Natural Earth lowres dataset path, skipping if unavailable."""
    # skip if data missing, unless on github actions
    dataset_present = os.path.isfile(_NATURALEARTH_LOWRES)
    if not dataset_present and not os.getenv("GITHUB_ACTIONS"):
        pytest.skip("Naturalearth lowres dataset not found")
    return _NATURALEARTH_LOWRES
@pytest.fixture(scope="session")
def naturalearth_cities() -> str:
    """Return the Natural Earth cities dataset path, skipping if unavailable."""
    # skip if data missing, unless on github actions
    dataset_present = os.path.isfile(_NATURALEARTH_CITIES)
    if not dataset_present and not os.getenv("GITHUB_ACTIONS"):
        pytest.skip("Naturalearth cities dataset not found")
    return _NATURALEARTH_CITIES
@pytest.fixture(scope="session")
def nybb_filename() -> str:
    """Return the NYBB dataset location, skipping if the file is unavailable."""
    # skip if data missing, unless on github actions
    # The existence check uses the path with the leading "zip://" cut off.
    local_path = _NYBB[len("zip://") :]
    if not os.path.isfile(local_path) and not os.getenv("GITHUB_ACTIONS"):
        pytest.skip("NYBB dataset not found")
    return _NYBB
@pytest.fixture(scope="class")
def _setup_class_nybb_filename(nybb_filename, request):
    """Attach nybb_filename class attribute for unittest style setup_method"""
    # request.cls is the test class requesting this fixture; the value comes
    # from the session-scoped nybb_filename fixture.
    request.cls.nybb_filename = nybb_filename
@@ -0,0 +1,25 @@
__all__ = []
available = [] # previously part of __all__
# Names of the datasets that used to be available through this module.
_prev_available = ["naturalearth_cities", "naturalearth_lowres", "nybb"]


def get_path(dataset):
    """
    Always raise: the bundled datasets were removed in GeoPandas 1.0.

    Parameters
    ----------
    dataset : object
        Name of the requested dataset. For a formerly bundled name the error
        message says where that data can be obtained; for anything else it
        points to the geodatasets package.

    Raises
    ------
    AttributeError
        Always, including for non-string input.
    """
    if dataset not in _prev_available:
        raise AttributeError(
            "The geopandas.dataset has been deprecated and "
            "was removed in GeoPandas 1.0. New sample datasets are now available "
            "in the geodatasets package (https://geodatasets.readthedocs.io/en/latest/)"
        )

    # Only build the dataset-specific text once ``dataset`` is known to be one
    # of the old string names. Doing it up front raised TypeError on
    # non-string input, from the ``'natural' in dataset`` test.
    ne_message = "https://www.naturalearthdata.com/downloads/110m-cultural-vectors/."
    nybb_message = (
        "the geodatasets package.\n\nfrom geodatasets import get_path\n"
        "path_to_file = get_path('nybb')\n"
    )
    raise AttributeError(
        "The geopandas.dataset has been deprecated and was removed in GeoPandas "
        f"1.0. You can get the original '{dataset}' data from "
        f"{ne_message if 'natural' in dataset else nybb_message}"
    )
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,614 @@
import json
from packaging.version import Version
from typing import Dict, Optional, Tuple
import numpy as np
import pandas as pd
import pyarrow as pa
from numpy.typing import NDArray
import shapely
from shapely import GeometryType
from geopandas import GeoDataFrame
from geopandas._compat import SHAPELY_GE_204
from geopandas.array import from_shapely, from_wkb
# Suffixes (the part after "geoarrow.") of the extension names that
# arrow_to_geopandas / arrow_to_geometry_array decode via
# construct_shapely_array; "geoarrow.wkb" is handled separately there.
GEOARROW_ENCODINGS = [
    "point",
    "linestring",
    "polygon",
    "multipoint",
    "multilinestring",
    "multipolygon",
]
## GeoPandas -> GeoArrow
class ArrowTable:
    """
    Wrapper class for Arrow data.

    This class implements the `Arrow PyCapsule Protocol`_ (i.e. having an
    ``__arrow_c_stream__`` method). This object can then be consumed by
    your Arrow implementation of choice that supports this protocol.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

    Example
    -------
    >>> import pyarrow as pa
    >>> pa.table(gdf.to_arrow())  # doctest: +SKIP
    """

    def __init__(self, pa_table):
        # The wrapped object must itself provide ``__arrow_c_stream__``.
        self._pa_table = pa_table

    def __arrow_c_stream__(self, requested_schema=None):
        """Delegate the stream export to the wrapped table."""
        export_stream = self._pa_table.__arrow_c_stream__
        return export_stream(requested_schema=requested_schema)
class GeoArrowArray:
    """
    Wrapper class for a geometry array as Arrow data.

    This class implements the `Arrow PyCapsule Protocol`_ (i.e. having an
    ``__arrow_c_array__`` method). This object can then be consumed by
    your Arrow implementation of choice that supports this protocol.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

    Example
    -------
    >>> import pyarrow as pa
    >>> pa.array(ser.to_arrow())  # doctest: +SKIP
    """

    def __init__(self, pa_field, pa_array):
        self._pa_array = pa_array
        self._pa_field = pa_field

    def __arrow_c_array__(self, requested_schema=None):
        """Return ``(schema_capsule, array_capsule)`` for the wrapped data.

        The schema comes from the stored field, not from the array.

        Raises
        ------
        NotImplementedError
            If ``requested_schema`` is given.
        """
        if requested_schema is None:
            schema_capsule = self._pa_field.__arrow_c_schema__()
            exported = self._pa_array.__arrow_c_array__()
            # Drop the array's own schema in favour of the field's.
            return (schema_capsule, exported[1])
        raise NotImplementedError(
            "Requested schema is not supported for geometry arrays"
        )
def geopandas_to_arrow(
    df,
    index=None,
    geometry_encoding="WKB",
    interleaved=True,
    include_z=None,
):
    """
    Convert GeoDataFrame to a pyarrow.Table.

    Parameters
    ----------
    df : GeoDataFrame
        The GeoDataFrame to convert.
    index : bool, default None
        If ``True``, always include the dataframe's index(es) as columns
        in the file output.
        If ``False``, the index(es) will not be written to the file.
        If ``None``, the index(es) will be included as columns in the file
        output except `RangeIndex` which is stored as metadata only.
    geometry_encoding : {'WKB', 'geoarrow' }, default 'WKB'
        The GeoArrow encoding to use for the data conversion. Matched
        case-insensitively.
    interleaved : bool, default True
        Only relevant for 'geoarrow' encoding. If True, the geometries'
        coordinates are interleaved in a single fixed size list array.
        If False, the coordinates are stored as separate arrays in a
        struct type.
    include_z : bool, default None
        Only relevant for 'geoarrow' encoding (for WKB, the dimensionality
        of the individual geometries is preserved).
        If False, return 2D geometries. If True, include the third dimension
        in the output (if a geometry has no third dimension, the z-coordinates
        will be NaN). By default, will infer the dimensionality from the
        input geometries. Note that this inference can be unreliable with
        empty geometries (for a guaranteed result, it is recommended to
        specify the keyword).

    Returns
    -------
    tuple of (pyarrow.Table, dict)
        The converted table and a mapping of geometry column name to the
        encoding used for it ("WKB", or the geoarrow extension name without
        its "geoarrow." prefix).

    Raises
    ------
    ValueError
        If ``geometry_encoding`` is not recognised, or if 'geoarrow' is
        requested with pyarrow < 10.0.
    """
    is_geometry = df.dtypes == "geometry"
    geometry_columns = df.columns[is_geometry]
    geometry_positions = np.asarray(is_geometry).nonzero()[0]

    # replace geometry columns with dummy values -> will get converted to
    # Arrow null column (not holding any memory), so we can afterwards
    # fill the resulting table with the correct geometry fields
    attributes = pd.DataFrame(df.copy(deep=False))
    for name in geometry_columns:
        attributes[name] = None

    table = pa.Table.from_pandas(attributes, preserve_index=index)

    encodings = {}
    requested = geometry_encoding.lower()

    if requested == "geoarrow":
        if Version(pa.__version__) < Version("10.0.0"):
            raise ValueError("Converting to 'geoarrow' requires pyarrow >= 10.0.")

        # Encode all geometry columns to GeoArrow
        for position, name in zip(geometry_positions, geometry_columns):
            column = df[name]
            field, geom_arr = construct_geometry_array(
                np.array(column.array),
                include_z=include_z,
                field_name=name,
                crs=column.crs,
                interleaved=interleaved,
            )
            table = table.set_column(position, field, geom_arr)
            extension_name = field.metadata[b"ARROW:extension:name"].decode()
            encodings[name] = extension_name.removeprefix("geoarrow.")

    elif requested == "wkb":
        # Encode all geometry columns to WKB
        for position, name in zip(geometry_positions, geometry_columns):
            column = df[name]
            field, wkb_arr = construct_wkb_array(
                np.asarray(column.array), field_name=name, crs=column.crs
            )
            table = table.set_column(position, field, wkb_arr)
            encodings[name] = "WKB"

    else:
        raise ValueError(
            f"Expected geometry encoding 'WKB' or 'geoarrow' got {geometry_encoding}"
        )
    return table, encodings
def construct_wkb_array(
    shapely_arr: NDArray[np.object_],
    *,
    field_name: str = "geometry",
    crs: Optional[str] = None,
) -> Tuple[pa.Field, pa.Array]:
    """
    Encode shapely geometries as a WKB binary Arrow array.

    Parameters
    ----------
    shapely_arr : numpy array of shapely geometries
        The geometries to encode.
    field_name : str, default "geometry"
        Name given to the returned field.
    crs : optional
        If given, ``crs.to_json()`` is stored under "crs" in the extension
        metadata. NOTE(review): annotated as ``str`` but ``.to_json()`` is
        called on it, so a CRS-like object appears to be expected - confirm.

    Returns
    -------
    tuple of (pyarrow.Field, pyarrow.Array)
        A nullable binary field carrying the "geoarrow.wkb" extension
        metadata, and the binary array of WKB values.

    Raises
    ------
    ValueError
        If the input contains 3D geometries and GEOS is older than 3.10.
    """
    # ISO-flavoured WKB needs GEOS 3.10.0 or newer. ``>=`` includes 3.10.0
    # itself, matching the "GEOS<3.10" wording of the error below.
    if shapely.geos_version >= (3, 10, 0):
        kwargs = {"flavor": "iso"}
    else:
        if shapely.has_z(shapely_arr).any():
            raise ValueError("Cannot write 3D geometries with GEOS<3.10")
        kwargs = {}

    wkb_arr = shapely.to_wkb(shapely_arr, **kwargs)
    extension_metadata = {"ARROW:extension:name": "geoarrow.wkb"}
    if crs is not None:
        extension_metadata["ARROW:extension:metadata"] = json.dumps(
            {"crs": crs.to_json()}
        )
    else:
        # In theory this should not be needed, but otherwise pyarrow < 17
        # crashes on receiving such data through C Data Interface
        # https://github.com/apache/arrow/issues/41741
        extension_metadata["ARROW:extension:metadata"] = "{}"

    field = pa.field(
        field_name, type=pa.binary(), nullable=True, metadata=extension_metadata
    )
    parr = pa.array(np.asarray(wkb_arr), pa.binary())
    return field, parr
def _convert_inner_coords(coords, interleaved, dims, mask=None):
    """
    Build the innermost Arrow coordinate array from a 2D coordinate array.

    With ``interleaved`` a fixed-size-list array is built from the flattened
    coordinates, its item field named ``dims``. Otherwise a struct array with
    one non-nullable float64 field per axis is built: "x"/"y" when ``dims``
    is "xy", and "x"/"y"/"z" for any other value.
    """
    if not interleaved:
        axis_names = "xy" if dims == "xy" else "xyz"
        fields = [pa.field(axis, pa.float64(), nullable=False) for axis in axis_names]
        # Copy each column out of ``coords`` before handing it to Arrow.
        columns = [coords[:, i].copy() for i in range(len(axis_names))]
        return pa.StructArray.from_arrays(columns, fields=fields, mask=mask)

    coords_field = pa.field(dims, pa.float64(), nullable=False)
    typ = pa.list_(coords_field, len(dims))
    if mask is None:
        # mask keyword only added in pyarrow 15.0.0
        return pa.FixedSizeListArray.from_arrays(coords.ravel(), type=typ)
    return pa.FixedSizeListArray.from_arrays(coords.ravel(), type=typ, mask=mask)
def _linestring_type(point_type):
    """Arrow list type whose non-nullable items, named "vertices", are points."""
    vertices = pa.field("vertices", point_type, nullable=False)
    return pa.list_(vertices)
def _polygon_type(point_type):
    """Arrow type for polygons: a list of "rings", each a list of "vertices"."""
    vertices = pa.field("vertices", point_type, nullable=False)
    rings = pa.field("rings", pa.list_(vertices), nullable=False)
    return pa.list_(rings)
def _multipoint_type(point_type):
    """Arrow list type whose non-nullable items, named "points", are points."""
    points = pa.field("points", point_type, nullable=False)
    return pa.list_(points)
def _multilinestring_type(point_type):
    """Arrow type for multilinestrings: a list of "linestrings"."""
    linestrings = pa.field(
        "linestrings", _linestring_type(point_type), nullable=False
    )
    return pa.list_(linestrings)
def _multipolygon_type(point_type):
    """Arrow type for multipolygons: a list of "polygons"."""
    polygons = pa.field("polygons", _polygon_type(point_type), nullable=False)
    return pa.list_(polygons)
def construct_geometry_array(
    shapely_arr: NDArray[np.object_],
    include_z: Optional[bool] = None,
    *,
    field_name: str = "geometry",
    crs: Optional[str] = None,
    interleaved: bool = True,
) -> Tuple[pa.Field, pa.Array]:
    """
    Encode shapely geometries as a native GeoArrow array.

    Parameters
    ----------
    shapely_arr : numpy array of shapely geometries
        The geometries to encode.
    include_z : bool, optional
        Passed on to ``shapely.to_ragged_array``.
    field_name : str, default "geometry"
        Name given to the returned field.
    crs : optional
        If given, ``crs.to_json()`` is stored under "crs" in the extension
        metadata.
    interleaved : bool, default True
        Whether coordinates are stored interleaved (fixed size list) or as
        separate struct fields.

    Returns
    -------
    tuple of (pyarrow.Field, pyarrow.Array)

    Raises
    ------
    ValueError
        For missing points with interleaved coordinates on pyarrow < 15.0.0,
        for coordinates that are neither 2D nor 3D, and for geometry types
        without a geoarrow encoding.
    """
    # NOTE: this implementation returns a (field, array) pair so that it can set the
    # extension metadata on the field without instantiating extension types into the
    # global pyarrow registry
    geom_type, coords, offsets = shapely.to_ragged_array(
        shapely_arr, include_z=include_z
    )

    missing = shapely.is_missing(shapely_arr)
    mask = None
    if missing.any():
        is_point = geom_type == GeometryType.POINT
        if is_point and interleaved and Version(pa.__version__) < Version("15.0.0"):
            raise ValueError(
                "Converting point geometries with missing values is not supported "
                "for interleaved coordinates with pyarrow < 15.0.0. Please "
                "upgrade to a newer version of pyarrow."
            )
        mask = pa.array(missing, type=pa.bool_())

        if is_point and not SHAPELY_GE_204:
            # bug in shapely < 2.0.4, see https://github.com/shapely/shapely/pull/2034
            # this workaround only works if there are no empty points
            indices = np.nonzero(mask)[0]
            indices = indices - np.arange(len(indices))
            coords = np.insert(coords, indices, np.nan, axis=0)

    n_dims = coords.shape[-1]
    if n_dims == 2:
        dims = "xy"
    elif n_dims == 3:
        dims = "xyz"
    else:
        raise ValueError(f"Unexpected coords dimensions: {coords.shape}")

    extension_metadata: Dict[str, str] = {}
    if crs is None:
        # In theory this should not be needed, but otherwise pyarrow < 17
        # crashes on receiving such data through C Data Interface
        # https://github.com/apache/arrow/issues/41741
        extension_metadata["ARROW:extension:metadata"] = "{}"
    else:
        extension_metadata["ARROW:extension:metadata"] = json.dumps(
            {"crs": crs.to_json()}
        )

    # Build the array per geometry type. The validity mask is applied to the
    # coordinates for points and to the outermost list for everything else.
    if geom_type == GeometryType.POINT:
        parr = _convert_inner_coords(coords, interleaved, dims, mask=mask)
        extension_name = "geoarrow.point"

    elif geom_type == GeometryType.LINESTRING:
        assert len(offsets) == 1, "Expected one offsets array"
        (geom_offsets,) = offsets
        _parr = _convert_inner_coords(coords, interleaved, dims)
        parr = pa.ListArray.from_arrays(
            pa.array(geom_offsets), _parr, _linestring_type(_parr.type), mask=mask
        )
        extension_name = "geoarrow.linestring"

    elif geom_type == GeometryType.POLYGON:
        assert len(offsets) == 2, "Expected two offsets arrays"
        ring_offsets, geom_offsets = offsets
        _parr = _convert_inner_coords(coords, interleaved, dims)
        _parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
        parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
        parr = parr.cast(_polygon_type(_parr.type))
        extension_name = "geoarrow.polygon"

    elif geom_type == GeometryType.MULTIPOINT:
        assert len(offsets) == 1, "Expected one offsets array"
        (geom_offsets,) = offsets
        _parr = _convert_inner_coords(coords, interleaved, dims)
        parr = pa.ListArray.from_arrays(
            pa.array(geom_offsets), _parr, type=_multipoint_type(_parr.type), mask=mask
        )
        extension_name = "geoarrow.multipoint"

    elif geom_type == GeometryType.MULTILINESTRING:
        assert len(offsets) == 2, "Expected two offsets arrays"
        ring_offsets, geom_offsets = offsets
        _parr = _convert_inner_coords(coords, interleaved, dims)
        _parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
        parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
        parr = parr.cast(_multilinestring_type(_parr.type))
        extension_name = "geoarrow.multilinestring"

    elif geom_type == GeometryType.MULTIPOLYGON:
        assert len(offsets) == 3, "Expected three offsets arrays"
        ring_offsets, polygon_offsets, geom_offsets = offsets
        _parr = _convert_inner_coords(coords, interleaved, dims)
        _parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
        _parr2 = pa.ListArray.from_arrays(pa.array(polygon_offsets), _parr1)
        parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr2, mask=mask)
        parr = parr.cast(_multipolygon_type(_parr.type))
        extension_name = "geoarrow.multipolygon"

    else:
        raise ValueError(f"Unsupported type for geoarrow: {geom_type}")

    # Common tail: tag the field with the extension name for this type.
    extension_metadata["ARROW:extension:name"] = extension_name
    field = pa.field(
        field_name,
        parr.type,
        nullable=True,
        metadata=extension_metadata,
    )
    return field, parr
## GeoArrow -> GeoPandas
def _get_arrow_geometry_field(field):
if (meta := field.metadata) is not None:
if (ext_name := meta.get(b"ARROW:extension:name", None)) is not None:
if ext_name.startswith(b"geoarrow."):
if (
ext_meta := meta.get(b"ARROW:extension:metadata", None)
) is not None:
ext_meta = json.loads(ext_meta.decode())
return ext_name.decode(), ext_meta
if isinstance(field.type, pa.ExtensionType):
ext_name = field.type.extension_name
if ext_name.startswith("geoarrow."):
ext_meta_ser = field.type.__arrow_ext_serialize__()
if ext_meta_ser:
ext_meta = json.loads(ext_meta_ser.decode())
else:
ext_meta = None
return ext_name, ext_meta
return None
def arrow_to_geopandas(table, geometry=None):
    """
    Convert Arrow table object to a GeoDataFrame based on GeoArrow extension types.

    Parameters
    ----------
    table : pyarrow.Table
        The Arrow table to convert. Anything else is first passed through
        ``pyarrow.table``.
    geometry : str, default None
        The name of the geometry column to set as the active geometry
        column. If None, the first geometry column found will be used.

    Returns
    -------
    GeoDataFrame

    Raises
    ------
    ValueError
        If the table has no GeoArrow geometry column.
    TypeError
        If a geometry column uses an unknown GeoArrow extension type.
    """
    if not isinstance(table, pa.Table):
        table = pa.table(table)

    # (position, column name, extension name, extension metadata) per geometry
    geom_fields = []
    for position, field in enumerate(table.schema):
        info = _get_arrow_geometry_field(field)
        if info is None:
            continue
        geom_fields.append((position, field.name, *info))

    if not geom_fields:
        raise ValueError("No geometry column found in the Arrow table.")

    # Convert the attribute columns on their own, then insert each geometry
    # column back at its original position.
    geometry_names = [entry[1] for entry in geom_fields]
    df = table.drop(geometry_names).to_pandas()

    for position, col, ext_name, ext_meta in geom_fields:
        has_crs = ext_meta is not None and "crs" in ext_meta
        crs = ext_meta["crs"] if has_crs else None

        if ext_name == "geoarrow.wkb":
            geom_arr = from_wkb(np.array(table[col]), crs=crs)
        elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
            shapely_geoms = construct_shapely_array(
                table[col].combine_chunks(), ext_name
            )
            geom_arr = from_shapely(shapely_geoms, crs=crs)
        else:
            raise TypeError(f"Unknown GeoArrow extension type: {ext_name}")

        df.insert(position, col, geom_arr)

    return GeoDataFrame(df, geometry=geometry or geom_fields[0][1])
def arrow_to_geometry_array(arr):
    """
    Convert Arrow array object (representing single GeoArrow array) to a
    geopandas GeometryArray.

    Specifically for GeoSeries.from_arrow.

    Raises
    ------
    ValueError
        With pyarrow < 14.0, when the array's field is not a GeoArrow
        geometry field, or when its extension type is unknown.
    """
    if Version(pa.__version__) < Version("14.0.0"):
        raise ValueError("Importing from Arrow requires pyarrow >= 14.0.")

    # Import through the capsules exported by ``arr`` so the field, with its
    # metadata, is available alongside the array.
    schema_capsule, array_capsule = arr.__arrow_c_array__()
    field = pa.Field._import_from_c_capsule(schema_capsule)
    pa_arr = pa.Array._import_from_c_capsule(field.__arrow_c_schema__(), array_capsule)

    geom_info = _get_arrow_geometry_field(field)
    if geom_info is None:
        raise ValueError("No GeoArrow geometry field found.")
    ext_name, ext_meta = geom_info

    has_crs = ext_meta is not None and "crs" in ext_meta
    crs = ext_meta["crs"] if has_crs else None

    if ext_name == "geoarrow.wkb":
        return from_wkb(np.array(pa_arr), crs=crs)
    if ext_name.split(".")[1] in GEOARROW_ENCODINGS:
        return from_shapely(construct_shapely_array(pa_arr, ext_name), crs=crs)
    raise ValueError(f"Unknown GeoArrow extension type: {ext_name}")
def _get_inner_coords(arr):
    """
    Return the coordinates of an innermost Arrow coordinate array as a 2D
    NumPy array with one row per coordinate.

    Struct arrays are read from their "x", "y" (and "z" unless there are
    exactly two fields) children; anything else is treated as a fixed size
    list.
    """
    if not pa.types.is_struct(arr.type):
        # fixed size list
        return np.asarray(arr.values).reshape(len(arr), -1)

    axes = ("x", "y") if arr.type.num_fields == 2 else ("x", "y", "z")
    return np.column_stack([np.asarray(arr.field(axis)) for axis in axes])
def construct_shapely_array(arr: pa.Array, extension_name: str):
    """
    Construct a NumPy array of shapely geometries from a pyarrow.Array
    with GeoArrow extension type.

    Raises
    ------
    ValueError
        If ``extension_name`` is not one of the supported geoarrow names.
    """
    if isinstance(arr, pa.ExtensionArray):
        arr = arr.storage

    if extension_name == "geoarrow.point":
        coords = _get_inner_coords(arr)
        result = shapely.from_ragged_array(GeometryType.POINT, coords, None)
    else:
        # extension name -> (shapely geometry type, number of list levels
        # wrapped around the coordinate array)
        nesting = {
            "geoarrow.linestring": (GeometryType.LINESTRING, 1),
            "geoarrow.polygon": (GeometryType.POLYGON, 2),
            "geoarrow.multipoint": (GeometryType.MULTIPOINT, 1),
            "geoarrow.multilinestring": (GeometryType.MULTILINESTRING, 2),
            "geoarrow.multipolygon": (GeometryType.MULTIPOLYGON, 3),
        }
        if extension_name not in nesting:
            raise ValueError(extension_name)
        geom_type, depth = nesting[extension_name]

        # Walk from the outermost list inwards, collecting offsets.
        collected = []
        level = arr
        for _ in range(depth):
            collected.append(np.asarray(level.offsets))
            level = level.values
        coords = _get_inner_coords(level)
        # from_ragged_array is given the offsets innermost first.
        offsets = tuple(reversed(collected))
        result = shapely.from_ragged_array(geom_type, coords, offsets)

    # apply validity mask
    if arr.null_count:
        mask = np.asarray(arr.is_null())
        result = np.where(mask, None, result)

    return result
@@ -0,0 +1,72 @@
from packaging.version import Version
import pyarrow
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)`
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
See https://arrow.apache.org/docs/dev/python/extending_types.html#defining-extension-types-user-defined-types
for more details.
"""
def patch_pyarrow():
    """
    Block deserialization of 'arrow.py_extension_type' on older pyarrow.

    Does nothing when pyarrow is >= 14.0.1, when the ``pyarrow_hotfix``
    package is importable, or when ``pyarrow._hotfix_installed`` is already
    set. Otherwise it replaces the registered 'arrow.py_extension_type' with
    a type whose deserialization always raises ``RuntimeError``.
    """
    # starting from pyarrow 14.0.1, it has its own mechanism
    if Version(pyarrow.__version__) >= Version("14.0.1"):
        return

    # if the user has pyarrow_hotfix (https://github.com/pitrou/pyarrow-hotfix)
    # installed, use this instead (which also ensures it works if they had
    # called `pyarrow_hotfix.uninstall()`)
    try:
        import pyarrow_hotfix  # noqa: F401
    except ImportError:
        pass
    else:
        return

    # if the hotfix is already installed and enabled
    if getattr(pyarrow, "_hotfix_installed", False):
        return

    class ForbiddenExtensionType(pyarrow.ExtensionType):
        def __arrow_ext_serialize__(self):
            return b""

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            import io
            import pickletools

            # Disassemble the pickle payload into text for the error message.
            out = io.StringIO()
            pickletools.dis(serialized, out)
            raise RuntimeError(
                _ERROR_MSG.format(
                    storage_type=storage_type,
                    serialized=serialized,
                    pickle_disassembly=out.getvalue(),
                )
            )

    # Swap the registered type for the forbidding one under the same name.
    pyarrow.unregister_extension_type("arrow.py_extension_type")
    pyarrow.register_extension_type(
        ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
    )

    # Marker read by the early-return check above on later calls.
    pyarrow._hotfix_installed = True
patch_pyarrow()
@@ -0,0 +1,913 @@
import json
import warnings
from packaging.version import Version
import numpy as np
from pandas import DataFrame, Series
import shapely
import geopandas
from geopandas import GeoDataFrame
from geopandas._compat import import_optional_dependency
from geopandas.array import from_shapely, from_wkb
from .file import _expand_user
# GeoParquet metadata spec version (see the "version" key in the metadata
# structure described below).
METADATA_VERSION = "1.0.0"
# NOTE(review): presumably the spec versions this module accepts - confirm
# against the code that uses it.
SUPPORTED_VERSIONS = ["0.1.0", "0.4.0", "1.0.0-beta.1", "1.0.0", "1.1.0"]
# Names of the native GeoArrow geometry encodings.
GEOARROW_ENCODINGS = [
    "point",
    "linestring",
    "polygon",
    "multipoint",
    "multilinestring",
    "multipolygon",
]
# "WKB" plus every native GeoArrow encoding.
SUPPORTED_ENCODINGS = ["WKB"] + GEOARROW_ENCODINGS
# reference: https://github.com/opengeospatial/geoparquet
# Metadata structure:
# {
# "geo": {
# "columns": {
# "<name>": {
# "encoding": "WKB"
# "geometry_types": <list of str: REQUIRED>
# "crs": "<PROJJSON or None: OPTIONAL>",
# "orientation": "<'counterclockwise' or None: OPTIONAL>"
# "edges": "planar"
# "bbox": <list of [xmin, ymin, xmax, ymax]: OPTIONAL>
# "epoch": <float: OPTIONAL>
# }
# },
# "primary_column": "<str: REQUIRED>",
# "version": "<METADATA_VERSION>",
#
# # Additional GeoPandas specific metadata (not in metadata spec)
# "creator": {
# "library": "geopandas",
# "version": "<geopandas.__version__>"
# }
# }
# }
def _is_fsspec_url(url):
return (
isinstance(url, str)
and "://" in url
and not url.startswith(("http://", "https://"))
)
def _remove_id_from_member_of_ensembles(json_dict):
"""
Older PROJ versions will not recognize IDs of datum ensemble members that
were added in more recent PROJ database versions.
Cf https://github.com/opengeospatial/geoparquet/discussions/110
and https://github.com/OSGeo/PROJ/pull/3221
Mimicking the patch to GDAL from https://github.com/OSGeo/gdal/pull/5872
"""
for key, value in json_dict.items():
if isinstance(value, dict):
_remove_id_from_member_of_ensembles(value)
elif key == "members" and isinstance(value, list):
for member in value:
member.pop("id", None)
# Geometry type names indexed by type id (0 to 7); _get_geometry_types looks
# names up here using the ids returned by shapely.get_type_id.
# NOTE(review): "LineString" is listed at both index 1 and index 2 -- presumably
# index 2 is shapely's LinearRing type id, reported as "LineString"; confirm
# before "fixing" the apparent duplicate.
_geometry_type_names = [
    "Point",
    "LineString",
    "LineString",
    "Polygon",
    "MultiPoint",
    "MultiLineString",
    "MultiPolygon",
    "GeometryCollection",
]
# Entries 8 to 15 repeat the names with a " Z" suffix; _get_geometry_types adds
# 8 to the type id of geometries that have a Z coordinate.
_geometry_type_names += [geom_type + " Z" for geom_type in _geometry_type_names]
def _get_geometry_types(series):
    """
    Return the sorted, unique geometry type names present in a GeoSeries.

    Geometries with a Z coordinate are reported with a " Z" suffix; missing
    values are ignored.
    """
    geoms = series.array._data
    type_ids = shapely.get_type_id(geoms)

    # ensure to include "... Z" for 3D geometries: the 3D names live 8
    # positions further in _geometry_type_names
    type_ids[shapely.has_z(geoms)] += 8

    unique_ids = Series(type_ids).unique().tolist()

    # drop missing values (shapely.get_type_id returns -1 for those)
    names = [_geometry_type_names[type_id] for type_id in unique_ids if type_id != -1]
    names.sort()
    return names
def _create_metadata(
    df, schema_version=None, geometry_encoding=None, write_covering_bbox=False
):
    """Create the geo metadata dict describing the geometry columns of ``df``.

    Parameters
    ----------
    df : GeoDataFrame
    schema_version : {'0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', '1.1.0', None}
        GeoParquet specification version; if not provided, '1.1.0' is used
        when any column has a non-WKB encoding and ``METADATA_VERSION``
        otherwise.
    geometry_encoding : dict, optional
        Mapping of geometry column name to the encoding recorded for that
        column. If not provided, every geometry column is recorded as "WKB".
    write_covering_bbox : bool, default False
        If True, add a "covering" entry to each geometry column's metadata
        that points at the fields of a struct column named 'bbox'. The
        column itself is not created here.

    Returns
    -------
    dict

    Raises
    ------
    ValueError
        If ``schema_version`` is not one of ``SUPPORTED_VERSIONS``.
    """
    geometry_columns = df.columns[df.dtypes == "geometry"]

    if geometry_encoding is None:
        # The signature advertises None as the default, so it must not crash
        # on the per-column lookup below: fall back to WKB for every column.
        geometry_encoding = dict.fromkeys(geometry_columns, "WKB")

    if schema_version is None:
        if geometry_encoding and any(
            encoding != "WKB" for encoding in geometry_encoding.values()
        ):
            # native (geoarrow) encodings are only described by spec 1.1.0
            schema_version = "1.1.0"
        else:
            schema_version = METADATA_VERSION

    if schema_version not in SUPPORTED_VERSIONS:
        raise ValueError(
            f"schema_version must be one of: {', '.join(SUPPORTED_VERSIONS)}"
        )

    # Construct metadata for each geometry
    column_metadata = {}
    for col in geometry_columns:
        series = df[col]

        geometry_types = _get_geometry_types(series)
        if schema_version[0] == "0":
            # 0.x versions use a singular key and a scalar for a single type
            geometry_types_name = "geometry_type"
            if len(geometry_types) == 1:
                geometry_types = geometry_types[0]
        else:
            geometry_types_name = "geometry_types"

        crs = None
        if series.crs:
            if schema_version == "0.1.0":
                crs = series.crs.to_wkt()
            else:  # version >= 0.4.0
                crs = series.crs.to_json_dict()
                _remove_id_from_member_of_ensembles(crs)

        column_metadata[col] = {
            "encoding": geometry_encoding[col],
            "crs": crs,
            geometry_types_name: geometry_types,
        }

        bbox = series.total_bounds.tolist()
        if np.isfinite(bbox).all():
            # don't add bbox with NaNs for empty / all-NA geometry column
            column_metadata[col]["bbox"] = bbox

        if write_covering_bbox:
            column_metadata[col]["covering"] = {
                "bbox": {
                    "xmin": ["bbox", "xmin"],
                    "ymin": ["bbox", "ymin"],
                    "xmax": ["bbox", "xmax"],
                    "ymax": ["bbox", "ymax"],
                },
            }

    return {
        "primary_column": df._geometry_column_name,
        "columns": column_metadata,
        "version": schema_version,
        "creator": {"library": "geopandas", "version": geopandas.__version__},
    }
def _encode_metadata(metadata):
"""Encode metadata dict to UTF-8 JSON string
Parameters
----------
metadata : dict
Returns
-------
UTF-8 encoded JSON string
"""
return json.dumps(metadata).encode("utf-8")
def _decode_metadata(metadata_str):
"""Decode a UTF-8 encoded JSON string to dict
Parameters
----------
metadata_str : string (UTF-8 encoded)
Returns
-------
dict
"""
if metadata_str is None:
return None
return json.loads(metadata_str.decode("utf-8"))
def _validate_dataframe(df):
"""Validate that the GeoDataFrame conforms to requirements for writing
to Parquet format.
Raises `ValueError` if the GeoDataFrame is not valid.
copied from `pandas.io.parquet`
Parameters
----------
df : GeoDataFrame
"""
if not isinstance(df, DataFrame):
raise ValueError("Writing to Parquet/Feather only supports IO with DataFrames")
# must have value column names (strings only)
if df.columns.inferred_type not in {"string", "unicode", "empty"}:
raise ValueError("Writing to Parquet/Feather requires string column names")
# index level names must be strings
valid_names = all(
isinstance(name, str) for name in df.index.names if name is not None
)
if not valid_names:
raise ValueError("Index level names must be strings")
def _validate_geo_metadata(metadata):
"""Validate geo metadata.
Must not be empty, and must contain the structure specified above.
Raises ValueError if metadata is not valid.
Parameters
----------
metadata : dict
"""
if not metadata:
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
# version was schema_version in 0.1.0
version = metadata.get("version", metadata.get("schema_version"))
if not version:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key: "
"'version'"
)
required_keys = ("primary_column", "columns")
for key in required_keys:
if metadata.get(key, None) is None:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key: "
"'{key}'".format(key=key)
)
if not isinstance(metadata["columns"], dict):
raise ValueError("'columns' in 'geo' metadata must be a dict")
# Validate that geometry columns have required metadata and values
# leaving out "geometry_type" for compatibility with 0.1
required_col_keys = ("encoding",)
for col, column_metadata in metadata["columns"].items():
for key in required_col_keys:
if key not in column_metadata:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key "
"'{key}' for column '{col}'".format(key=key, col=col)
)
if column_metadata["encoding"] not in SUPPORTED_ENCODINGS:
raise ValueError(
"Only WKB geometry encoding or one of the native encodings "
f"({GEOARROW_ENCODINGS!r}) are supported, "
f"got: {column_metadata['encoding']}"
)
if column_metadata.get("edges", "planar") == "spherical":
warnings.warn(
f"The geo metadata indicate that column '{col}' has spherical edges, "
"but because GeoPandas currently does not support spherical "
"geometry, it ignores this metadata and will interpret the edges of "
"the geometries as planar.",
UserWarning,
stacklevel=4,
)
if "covering" in column_metadata:
covering = column_metadata["covering"]
if "bbox" in covering:
bbox = covering["bbox"]
for var in ["xmin", "ymin", "xmax", "ymax"]:
if var not in bbox.keys():
raise ValueError("Metadata for bbox column is malformed.")
def _geopandas_to_arrow(
    df,
    index=None,
    geometry_encoding="WKB",
    schema_version=None,
    write_covering_bbox=None,
):
    """
    Convert a GeoDataFrame to a pyarrow Table carrying geo metadata.

    Helper function with main, shared logic for to_parquet/to_feather: the
    frame is validated and converted, the geo metadata is built and, when
    ``write_covering_bbox`` is truthy, a struct column named 'bbox' holding
    the per-row bounds is appended.

    Raises
    ------
    ValueError
        If a non-WKB encoding is combined with an explicit schema version
        other than '1.1.0', or if a covering bbox is requested while ``df``
        already has a column named 'bbox'.
    """
    from pyarrow import StructArray

    from geopandas.io._geoarrow import geopandas_to_arrow

    _validate_dataframe(df)

    if (
        schema_version is not None
        and geometry_encoding != "WKB"
        and schema_version != "1.1.0"
    ):
        raise ValueError(
            "'geoarrow' encoding is only supported with schema version >= 1.1.0"
        )

    table, geometry_encoding_dict = geopandas_to_arrow(
        df, geometry_encoding=geometry_encoding, index=index, interleaved=False
    )
    geo_metadata = _create_metadata(
        df,
        schema_version=schema_version,
        geometry_encoding=geometry_encoding_dict,
        write_covering_bbox=write_covering_bbox,
    )

    if write_covering_bbox:
        if "bbox" in df.columns:
            raise ValueError(
                "An existing column 'bbox' already exists in the dataframe. "
                "Please rename to write covering bbox."
            )
        row_bounds = df.bounds
        bound_arrays = [
            row_bounds["minx"],
            row_bounds["miny"],
            row_bounds["maxx"],
            row_bounds["maxy"],
        ]
        bbox_struct = StructArray.from_arrays(
            bound_arrays, names=["xmin", "ymin", "xmax", "ymax"]
        )
        table = table.append_column("bbox", bbox_struct)

    # Store geopandas specific file-level metadata
    # This must be done AFTER creating the table or it is not persisted
    schema_metadata = table.schema.metadata
    schema_metadata.update({b"geo": _encode_metadata(geo_metadata)})
    return table.replace_schema_metadata(schema_metadata)
def _to_parquet(
    df,
    path,
    index=None,
    compression="snappy",
    geometry_encoding="WKB",
    schema_version=None,
    write_covering_bbox=False,
    **kwargs,
):
    """
    Write a GeoDataFrame to the Parquet format.

    Geometry columns are serialized using ``geometry_encoding`` (WKB by
    default).

    Requires 'pyarrow'.

    This is tracking version 1.0.0 of the GeoParquet specification at:
    https://github.com/opengeospatial/geoparquet. Writing other supported
    versions is possible using the `schema_version` keyword.

    .. versionadded:: 0.8

    Parameters
    ----------
    path : str, path object
    index : bool, default None
        If ``True``, always include the dataframe's index(es) as columns
        in the file output.
        If ``False``, the index(es) will not be written to the file.
        If ``None``, the index(es) will be included as columns in the file
        output except `RangeIndex` which is stored as metadata only.
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
        The encoding to use for the geometry columns. Defaults to "WKB"
        for maximum interoperability. Specify "geoarrow" to use one of the
        native GeoArrow-based single-geometry type encodings.
    schema_version : {'0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', '1.1.0', None}
        GeoParquet specification version; if not provided, a default is
        chosen based on the geometry encoding.
    write_covering_bbox : bool, default False
        Writes the bounding box column for each row entry with column
        name 'bbox'. Writing a bbox column can be computationally
        expensive, hence the default setting is False.
    **kwargs
        Additional keyword arguments passed to pyarrow.parquet.write_table().
    """
    parquet = import_optional_dependency(
        "pyarrow.parquet", extra="pyarrow is required for Parquet support."
    )

    destination = _expand_user(path)
    arrow_table = _geopandas_to_arrow(
        df,
        index=index,
        geometry_encoding=geometry_encoding,
        schema_version=schema_version,
        write_covering_bbox=write_covering_bbox,
    )
    parquet.write_table(arrow_table, destination, compression=compression, **kwargs)
def _to_feather(df, path, index=None, compression=None, schema_version=None, **kwargs):
    """
    Write a GeoDataFrame to the Feather format.

    Any geometry columns present are serialized to WKB format in the file.

    Requires 'pyarrow' >= 0.17.

    This is tracking version 1.0.0 of the GeoParquet specification for
    the metadata at: https://github.com/opengeospatial/geoparquet. Writing
    other supported versions is possible using the `schema_version` keyword.

    .. versionadded:: 0.8

    Parameters
    ----------
    path : str, path object
    index : bool, default None
        If ``True``, always include the dataframe's index(es) as columns
        in the file output.
        If ``False``, the index(es) will not be written to the file.
        If ``None``, the index(es) will be included as columns in the file
        output except `RangeIndex` which is stored as metadata only.
    compression : {'zstd', 'lz4', 'uncompressed'}, optional
        Name of the compression to use. Use ``"uncompressed"`` for no
        compression. By default uses LZ4 if available, otherwise uncompressed.
    schema_version : {'0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', '1.1.0', None}
        GeoParquet specification version for the metadata; if not provided
        will default to ``METADATA_VERSION``.
    kwargs
        Additional keyword arguments passed to pyarrow.feather.write_feather().

    Raises
    ------
    ImportError
        If the installed pyarrow is older than 0.17.
    """
    feather = import_optional_dependency(
        "pyarrow.feather", extra="pyarrow is required for Feather support."
    )
    # TODO move this into `import_optional_dependency`
    import pyarrow

    installed = Version(pyarrow.__version__)
    if installed < Version("0.17.0"):
        raise ImportError("pyarrow >= 0.17 required for Feather support")

    destination = _expand_user(path)
    arrow_table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
    feather.write_feather(arrow_table, destination, compression=compression, **kwargs)
def _arrow_to_geopandas(table, geo_metadata=None):
    """
    Convert a pyarrow Table with geo metadata into a GeoDataFrame.

    Helper function with main, shared logic for read_parquet/read_feather.
    When ``geo_metadata`` is None it is decoded and validated from the
    table's schema metadata.

    Raises
    ------
    ValueError
        If none of the geometry columns listed in the metadata is present
        in ``table``.
    """
    if geo_metadata is None:
        # Note: this path of not passing metadata is also used by dask-geopandas
        geo_metadata = _validate_and_decode_metadata(table.schema.metadata)

    # Find all geometry columns that were read from the file. May
    # be a subset if 'columns' parameter is used.
    present = [name for name in geo_metadata["columns"] if name in table.column_names]

    # column order of the pandas result, taken from an empty slice of the table
    result_column_names = list(table.slice(0, 0).to_pandas().columns)
    geometry_columns = sorted(present, key=result_column_names.index)

    if not geometry_columns:
        raise ValueError(
            "No geometry columns are included in the columns read from\n"
            "the Parquet/Feather file. To read this file without geometry columns,\n"
            "use pandas.read_parquet/read_feather() instead."
        )

    geometry = geo_metadata["primary_column"]
    # Missing geometry likely indicates a subset of columns was read;
    # promote the first available geometry to the primary geometry.
    if geometry not in geometry_columns:
        geometry = geometry_columns[0]

        # if there are multiple non-primary geometry columns, raise a warning
        if len(geometry_columns) > 1:
            warnings.warn(
                "Multiple non-primary geometry columns read from Parquet/Feather "
                "file. The first column read was promoted to the primary geometry.",
                stacklevel=3,
            )

    df = table.drop(geometry_columns).to_pandas()

    # Convert the geometry columns that are present back to geometry and
    # re-insert them at their original position.
    for col in geometry_columns:
        col_metadata = geo_metadata["columns"][col]

        # per the GeoParquet spec, missing CRS is to be interpreted as
        # OGC:CRS84
        crs = col_metadata.get("crs", "OGC:CRS84")
        if isinstance(crs, dict):
            _remove_id_from_member_of_ensembles(crs)

        if col_metadata["encoding"] == "WKB":
            geom_arr = from_wkb(np.array(table[col]), crs=crs)
        else:
            from geopandas.io._geoarrow import construct_shapely_array

            shapely_geoms = construct_shapely_array(
                table[col].combine_chunks(), "geoarrow." + col_metadata["encoding"]
            )
            geom_arr = from_shapely(shapely_geoms, crs=crs)

        df.insert(result_column_names.index(col), col, geom_arr)

    return GeoDataFrame(df, geometry=geometry)
def _get_filesystem_path(path, filesystem=None, storage_options=None):
    """
    Get the filesystem and path for a given filesystem and path.

    If the filesystem is not None then it's just returned as is. Otherwise a
    native pyarrow filesystem is tried first (string paths without
    ``storage_options`` only), then fsspec for fsspec-style URLs.

    Returns
    -------
    tuple
        ``(filesystem, path)``; ``filesystem`` stays None when nothing could
        be resolved.

    Raises
    ------
    ValueError
        If ``storage_options`` are given but no filesystem was resolved.
    """
    import pyarrow

    if (
        isinstance(path, str)
        and storage_options is None
        and filesystem is None
        and Version(pyarrow.__version__) >= Version("5.0.0")
    ):
        # Use the native pyarrow filesystem if possible.
        try:
            from pyarrow.fs import FileSystem

            filesystem, path = FileSystem.from_uri(path)
        except Exception:
            # fallback to use get_handle / fsspec for filesystems
            # that pyarrow doesn't support
            pass

    if _is_fsspec_url(path) and filesystem is None:
        fsspec = import_optional_dependency(
            "fsspec", extra="fsspec is required for 'storage_options'."
        )
        filesystem, path = fsspec.core.url_to_fs(path, **(storage_options or {}))

    if filesystem is None and storage_options:
        raise ValueError(
            "Cannot provide 'storage_options' with non-fsspec path '{}'".format(path)
        )

    return filesystem, path
def _ensure_arrow_fs(filesystem):
    """
    Return ``filesystem`` as a pyarrow filesystem where possible.

    Simplified version of pyarrow.fs._ensure_filesystem. This is only needed
    below because `pyarrow.parquet.read_metadata` does not yet accept a
    filesystem keyword (https://issues.apache.org/jira/browse/ARROW-16719)

    pyarrow filesystems are returned unchanged, fsspec filesystems are
    wrapped in a ``PyFileSystem``, and anything else is passed through as is.
    """
    from pyarrow import fs

    if isinstance(filesystem, fs.FileSystem):
        return filesystem

    # handle fsspec-compatible filesystems (only if fsspec is importable)
    try:
        import fsspec
    except ImportError:
        return filesystem

    if isinstance(filesystem, fsspec.AbstractFileSystem):
        return fs.PyFileSystem(fs.FSSpecHandler(filesystem))
    return filesystem
def _validate_and_decode_metadata(metadata):
if metadata is None or b"geo" not in metadata:
raise ValueError(
"""Missing geo metadata in Parquet/Feather file.
Use pandas.read_parquet/read_feather() instead."""
)
# check for malformed metadata
try:
decoded_geo_metadata = _decode_metadata(metadata.get(b"geo", b""))
except (TypeError, json.decoder.JSONDecodeError):
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
_validate_geo_metadata(decoded_geo_metadata)
return decoded_geo_metadata
def _read_parquet_schema_and_metadata(path, filesystem):
    """
    Open the Parquet file/dataset a first time to get the schema and metadata.

    TODO: we should look into how we can reuse opened dataset for reading the
    actual data, to avoid discovering the dataset twice (problem right now is
    that the ParquetDataset interface doesn't allow passing the filters on read)

    Returns
    -------
    tuple
        ``(schema, metadata)`` where ``metadata`` is the schema metadata, or
        the raw file metadata when the schema metadata has no ``b"geo"`` key
        and the raw metadata could be read.
    """
    import pyarrow
    from pyarrow import parquet

    # `use_legacy_dataset` is only passed for pyarrow older than 15.0.0
    if Version(pyarrow.__version__) < Version("15.0.0"):
        dataset_kwargs = {"use_legacy_dataset": False}
    else:
        dataset_kwargs = {}

    try:
        dataset = parquet.ParquetDataset(path, filesystem=filesystem, **dataset_kwargs)
        schema = dataset.schema
    except Exception:
        schema = parquet.read_schema(path, filesystem=filesystem)

    metadata = schema.metadata

    # read metadata separately to get the raw Parquet FileMetaData metadata
    # (pyarrow doesn't properly exposes those in schema.metadata for files
    # created by GDAL - https://issues.apache.org/jira/browse/ARROW-16688)
    geo_missing = metadata is None or b"geo" not in metadata
    if geo_missing:
        try:
            metadata = parquet.read_metadata(path, filesystem=filesystem).metadata
        except Exception:
            # best effort: keep whatever the schema provided
            pass

    return schema, metadata
def _read_parquet(path, columns=None, storage_options=None, bbox=None, **kwargs):
    """
    Load a Parquet object from the file path, returning a GeoDataFrame.

    You can read a subset of columns in the file using the ``columns`` parameter.
    However, the structure of the returned GeoDataFrame will depend on which
    columns you read:

    * if no geometry columns are read, this will raise a ``ValueError`` - you
      should use the pandas `read_parquet` method instead.
    * if the primary geometry column saved to this file is not included in
      columns, the first available geometry column will be set as the geometry
      column of the returned GeoDataFrame.

    Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
    specification at: https://github.com/opengeospatial/geoparquet

    If 'crs' key is not present in the GeoParquet metadata associated with the
    Parquet object, it will default to "OGC:CRS84" according to the specification.

    Requires 'pyarrow'.

    .. versionadded:: 0.8

    Parameters
    ----------
    path : str, path object
    columns : list-like of strings, default=None
        If not None, only these columns will be read from the file. If
        the primary geometry column is not included, the first secondary
        geometry read from the file will be set as the geometry column
        of the returned GeoDataFrame. If no geometry columns are present,
        a ``ValueError`` will be raised.
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g. host,
        port, username, password, etc. For HTTP(S) URLs the key-value pairs are
        forwarded to urllib as header options. For other URLs (e.g. starting with
        "s3://", and "gcs://") the key-value pairs are forwarded to fsspec. Please
        see fsspec and urllib for more details.

        When no storage options are provided and a filesystem is implemented by
        both ``pyarrow.fs`` and ``fsspec`` (e.g. "s3://") then the ``pyarrow.fs``
        filesystem is preferred. Provide the instantiated fsspec filesystem using
        the ``filesystem`` keyword if you wish to use its implementation.
    bbox : tuple, optional
        Bounding box to be used to filter selection from geoparquet data. This
        is only usable if the data was saved with the bbox covering metadata
        or uses the 'point' encoding.
        Input is of the tuple format (xmin, ymin, xmax, ymax).
    **kwargs
        Any additional kwargs passed to :func:`pyarrow.parquet.read_table`.
        A ``filesystem`` keyword is consumed here to resolve ``path``, and a
        ``filters`` keyword is combined with the ``bbox`` filter.

    Returns
    -------
    GeoDataFrame

    Examples
    --------
    >>> df = geopandas.read_parquet("data.parquet")  # doctest: +SKIP

    Specifying columns to read:

    >>> df = geopandas.read_parquet(
    ...     "data.parquet",
    ...     columns=["geometry", "pop_est"]
    ... )  # doctest: +SKIP
    """
    parquet = import_optional_dependency(
        "pyarrow.parquet", extra="pyarrow is required for Parquet support."
    )
    import geopandas.io._pyarrow_hotfix  # noqa: F401

    # TODO(https://github.com/pandas-dev/pandas/pull/41194): see if pandas
    # adds filesystem as a keyword and match that.
    filesystem, path = _get_filesystem_path(
        path,
        filesystem=kwargs.pop("filesystem", None),
        storage_options=storage_options,
    )
    path = _expand_user(path)

    schema, metadata = _read_parquet_schema_and_metadata(path, filesystem)
    geo_metadata = _validate_and_decode_metadata(metadata)

    bbox_filter = None
    if bbox is not None:
        bbox_filter = _get_parquet_bbox_filter(geo_metadata, bbox)

    # by default, bbox column is not read in, so must specify which
    # columns are read in if it exists.
    has_covering = _check_if_covering_in_geo_metadata(geo_metadata)
    if not columns and has_covering:
        columns = _get_non_bbox_columns(schema, geo_metadata)

    # if both bbox and filters kwargs are used, must splice together.
    filters = bbox_filter
    if "filters" in kwargs:
        filters = _splice_bbox_and_filters(kwargs.pop("filters"), bbox_filter)

    kwargs["use_pandas_metadata"] = True

    table = parquet.read_table(
        path, columns=columns, filesystem=filesystem, filters=filters, **kwargs
    )

    return _arrow_to_geopandas(table, geo_metadata)
def _read_feather(path, columns=None, **kwargs):
    """
    Load a Feather object from the file path, returning a GeoDataFrame.

    You can read a subset of columns in the file using the ``columns`` parameter.
    However, the structure of the returned GeoDataFrame will depend on which
    columns you read:

    * if no geometry columns are read, this will raise a ``ValueError`` - you
      should use the pandas `read_feather` method instead.
    * if the primary geometry column saved to this file is not included in
      columns, the first available geometry column will be set as the geometry
      column of the returned GeoDataFrame.

    Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
    specification at: https://github.com/opengeospatial/geoparquet

    If the 'crs' key is not present in the geo metadata stored in the Feather
    file, it will default to "OGC:CRS84" according to the specification.

    Requires 'pyarrow' >= 0.17.

    .. versionadded:: 0.8

    Parameters
    ----------
    path : str, path object
    columns : list-like of strings, default=None
        If not None, only these columns will be read from the file. If
        the primary geometry column is not included, the first secondary
        geometry read from the file will be set as the geometry column
        of the returned GeoDataFrame. If no geometry columns are present,
        a ``ValueError`` will be raised.
    **kwargs
        Any additional kwargs passed to pyarrow.feather.read_table().

    Returns
    -------
    GeoDataFrame

    Examples
    --------
    >>> df = geopandas.read_feather("data.feather")  # doctest: +SKIP

    Specifying columns to read:

    >>> df = geopandas.read_feather(
    ...     "data.feather",
    ...     columns=["geometry", "pop_est"]
    ... )  # doctest: +SKIP
    """
    feather = import_optional_dependency(
        "pyarrow.feather", extra="pyarrow is required for Feather support."
    )
    # TODO move this into `import_optional_dependency`
    import pyarrow

    import geopandas.io._pyarrow_hotfix  # noqa: F401

    installed = Version(pyarrow.__version__)
    if installed < Version("0.17.0"):
        raise ImportError("pyarrow >= 0.17 required for Feather support")

    source = _expand_user(path)
    arrow_table = feather.read_table(source, columns=columns, **kwargs)
    return _arrow_to_geopandas(arrow_table)
def _get_parquet_bbox_filter(geo_metadata, bbox):
    """Build a pyarrow filter expression selecting rows that fall in ``bbox``.

    A covering bbox column is used when the primary column's metadata
    declares one; otherwise the primary column's ``x``/``y`` fields are
    compared directly, which requires the 'point' encoding.

    Parameters
    ----------
    geo_metadata : dict
        Decoded geo metadata of the file.
    bbox : tuple
        (xmin, ymin, xmax, ymax)

    Raises
    ------
    ValueError
        If the file has neither a covering bbox column nor 'point' encoding.
    """
    primary_column = geo_metadata["primary_column"]

    if _check_if_covering_in_geo_metadata(geo_metadata):
        covering_column = _get_bbox_encoding_column_name(geo_metadata)
        return _convert_bbox_to_parquet_filter(bbox, covering_column)

    if geo_metadata["columns"][primary_column]["encoding"] != "point":
        raise ValueError(
            "Specifying 'bbox' not supported for this Parquet file (it should either "
            "have a bbox covering column or use 'point' encoding)."
        )

    import pyarrow.compute as pc

    # point encoding: compare the coordinate fields of the struct directly
    within_xmin = pc.field((primary_column, "x")) >= bbox[0]
    within_xmax = pc.field((primary_column, "x")) <= bbox[2]
    within_ymin = pc.field((primary_column, "y")) >= bbox[1]
    within_ymax = pc.field((primary_column, "y")) <= bbox[3]
    return within_xmin & within_xmax & within_ymin & within_ymax
def _convert_bbox_to_parquet_filter(bbox, bbox_column_name):
    """Build a filter keeping rows whose covering box intersects ``bbox``.

    Parameters
    ----------
    bbox : tuple
        (xmin, ymin, xmax, ymax) of the query box.
    bbox_column_name : str
        Name of the struct column with ``xmin``/``ymin``/``xmax``/``ymax``
        fields.

    Returns
    -------
    pyarrow.compute.Expression
        The negation of "the row's box lies entirely outside the query box".
    """
    import pyarrow.compute as pc

    # each term is true when the row's box is completely on one side of bbox
    beyond_xmax = pc.field((bbox_column_name, "xmin")) > bbox[2]
    beyond_ymax = pc.field((bbox_column_name, "ymin")) > bbox[3]
    before_xmin = pc.field((bbox_column_name, "xmax")) < bbox[0]
    before_ymin = pc.field((bbox_column_name, "ymax")) < bbox[1]

    return ~(beyond_xmax | beyond_ymax | before_xmin | before_ymin)
def _check_if_covering_in_geo_metadata(geo_metadata):
primary_column = geo_metadata["primary_column"]
return "covering" in geo_metadata["columns"][primary_column].keys()
def _get_bbox_encoding_column_name(geo_metadata):
primary_column = geo_metadata["primary_column"]
return geo_metadata["columns"][primary_column]["covering"]["bbox"]["xmin"][0]
def _get_non_bbox_columns(schema, geo_metadata):
    """Return the schema's column names without the covering bbox column.

    The list obtained from ``schema.names`` is modified in place and
    returned; it is returned unchanged when the bbox column is not in it.
    """
    bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
    names = schema.names
    try:
        names.remove(bbox_column_name)
    except ValueError:
        # bbox column not part of the schema: nothing to drop
        pass
    return names
def _splice_bbox_and_filters(kwarg_filters, bbox_filter):
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
if bbox_filter is None:
return kwarg_filters
filters_expression = parquet.filters_to_expression(kwarg_filters)
return bbox_filter & filters_expression
@@ -0,0 +1,851 @@
from __future__ import annotations
import os
import urllib.request
import warnings
from io import IOBase
from packaging.version import Version
from pathlib import Path
# Adapted from pandas.io.common
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc, uses_params, uses_relative
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
import shapely
from shapely.geometry import mapping
from shapely.geometry.base import BaseGeometry
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20
from geopandas.io.util import vsi_path
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
# file:// URIs are supported by fiona/pyogrio -> don't already open + read the file here
_VALID_URLS.discard("file")
# Lazily populated state for the optional fiona dependency. ``fiona`` stays
# None until _import_fiona has run; afterwards it is the imported module, or
# False when the import failed (the reason is kept in ``fiona_import_error``).
fiona = None
fiona_env = None
fiona_import_error = None
FIONA_GE_19 = False


def _import_fiona():
    """Import fiona once and record the outcome in the module-level globals.

    Does nothing if an import was already attempted. On success ``fiona`` is
    the module, ``fiona_env`` is ``fiona.Env`` (or ``fiona.drivers``, or None
    if neither can be imported) and ``FIONA_GE_19`` tells whether the
    installed base version is at least 1.9.0. On failure ``fiona`` is False
    and ``fiona_import_error`` holds the error message.
    """
    global fiona, fiona_env, fiona_import_error, FIONA_GE_19

    if fiona is not None:
        return

    try:
        import fiona

        # only try to import fiona.Env if the main fiona import succeeded
        # (otherwise you can get confusing "AttributeError: module 'fiona'
        # has no attribute '_loading'" / partially initialized module errors)
        try:
            from fiona import Env as fiona_env
        except ImportError:
            try:
                from fiona import drivers as fiona_env
            except ImportError:
                fiona_env = None

        installed_base = Version(fiona.__version__).base_version
        FIONA_GE_19 = Version(installed_base) >= Version("1.9.0")

    except ImportError as err:
        fiona = False
        fiona_import_error = str(err)
# Lazily populated state for the optional pyogrio dependency. ``pyogrio`` stays
# None until _import_pyogrio has run; afterwards it is the imported module, or
# False when the import failed (the reason is kept in
# ``pyogrio_import_error``).
pyogrio = None
pyogrio_import_error = None


def _import_pyogrio():
    """Import pyogrio once and record the outcome in the module-level globals.

    Does nothing if an import was already attempted. On failure ``pyogrio``
    is set to False and ``pyogrio_import_error`` to the error message.
    """
    global pyogrio, pyogrio_import_error

    if pyogrio is not None:
        return

    try:
        import pyogrio
    except ImportError as err:
        pyogrio = False
        pyogrio_import_error = str(err)
def _check_fiona(func):
    """Raise ImportError if fiona is not available.

    Parameters
    ----------
    func : str
        Description of the caller, used in the error message.
    """
    if fiona:
        return

    message = (
        f"the {func} requires the 'fiona' package, but it is not installed or does "
        f"not import correctly.\nImporting fiona resulted in: {fiona_import_error}"
    )
    raise ImportError(message)
def _check_pyogrio(func):
    """Raise ImportError if pyogrio is not available.

    Parameters
    ----------
    func : str
        Description of the caller, used in the error message.

    Raises
    ------
    ImportError
        If the module-level ``pyogrio`` is falsy; the message includes the
        recorded import error.
    """
    if not pyogrio:
        raise ImportError(
            f"the {func} requires the 'pyogrio' package, but it is not installed "
            "or does not import correctly."
            # this fragment needs its own f-prefix, otherwise the placeholder
            # is emitted literally instead of the recorded import error
            f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
        )
def _check_metadata_supported(metadata: str | None, engine: str, driver: str) -> None:
if metadata is None:
return
if driver != "GPKG":
raise NotImplementedError(
"The 'metadata' keyword is only supported for the GPKG driver."
)
if engine == "fiona" and not FIONA_GE_19:
raise NotImplementedError(
"The 'metadata' keyword is only supported for Fiona >= 1.9."
)
def _check_engine(engine, func):
    """Resolve the IO engine to use and make sure it can be imported.

    Parameters
    ----------
    engine : str or None
        Requested engine. If None, ``geopandas.options.io_engine`` is used;
        if that is None as well, "pyogrio" is chosen when it imports,
        otherwise "fiona" when it imports.
    func : str
        Description of the caller, used in error messages.

    Returns
    -------
    The resolved engine. Values other than "pyogrio" and "fiona" are returned
    without any check.

    Raises
    ------
    ImportError
        If the selected engine cannot be imported, or if no engine was
        requested and neither pyogrio nor fiona imports.
    """
    if engine is None:
        import geopandas

        engine = geopandas.options.io_engine

    # if not specified through keyword or option, then default to "pyogrio" if
    # installed, otherwise try fiona
    if engine is None:
        _import_pyogrio()
        if pyogrio:
            engine = "pyogrio"
        else:
            _import_fiona()
            if fiona:
                engine = "fiona"

    if engine is None:
        raise ImportError(
            f"The {func} requires the 'pyogrio' or 'fiona' package, "
            "but neither is installed or imports correctly."
            f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
            f"\nImporting fiona resulted in: {fiona_import_error}"
        )

    if engine == "pyogrio":
        _import_pyogrio()
        _check_pyogrio(func)
    elif engine == "fiona":
        _import_fiona()
        _check_fiona(func)

    return engine
# Maps a lower-case file extension (including the leading dot) to a driver
# name. Several extensions may share one driver.
# NOTE(review): presumably used to infer the driver from a file name when none
# is given -- the consumer is outside this section; confirm.
_EXTENSION_TO_DRIVER = {
    ".bna": "BNA",
    ".dxf": "DXF",
    ".csv": "CSV",
    ".shp": "ESRI Shapefile",
    ".dbf": "ESRI Shapefile",
    ".json": "GeoJSON",
    ".geojson": "GeoJSON",
    ".geojsonl": "GeoJSONSeq",
    ".geojsons": "GeoJSONSeq",
    ".gpkg": "GPKG",
    ".gml": "GML",
    ".xml": "GML",
    ".gpx": "GPX",
    ".gtm": "GPSTrackMaker",
    ".gtz": "GPSTrackMaker",
    ".tab": "MapInfo File",
    ".mif": "MapInfo File",
    ".mid": "MapInfo File",
    ".dgn": "DGN",
    ".fgb": "FlatGeobuf",
}
def _expand_user(path):
"""Expand paths that use ~."""
if isinstance(path, str):
path = os.path.expanduser(path)
elif isinstance(path, Path):
path = path.expanduser()
return path
def _is_url(url):
    """Check to see if *url* has a valid protocol.

    Returns False, rather than raising, for inputs that cannot be parsed as
    a URL.
    """
    try:
        scheme = parse_url(url).scheme
        has_valid_scheme = scheme in _VALID_URLS
    except Exception:
        has_valid_scheme = False
    return has_valid_scheme
def _read_file(
    filename, bbox=None, mask=None, columns=None, rows=None, engine=None, **kwargs
):
    """
    Returns a GeoDataFrame from a file or URL.

    Parameters
    ----------
    filename : str, path object or file-like object
        Either the absolute or relative path to the file or URL to
        be opened, or any object with a read() method (such as an open file
        or StringIO)
    bbox : tuple | GeoDataFrame or GeoSeries | shapely Geometry, default None
        Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely
        geometry. With engine="fiona", CRS mis-matches are resolved if given a GeoSeries
        or GeoDataFrame. With engine="pyogrio", bbox must be in the same CRS as the
        dataset. Tuple is (minx, miny, maxx, maxy) to match the bounds property of
        shapely geometry objects. Cannot be used with mask.
    mask : dict | GeoDataFrame or GeoSeries | shapely Geometry, default None
        Filter for features that intersect with the given dict-like geojson
        geometry, GeoSeries, GeoDataFrame or shapely geometry.
        CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
        Cannot be used with bbox. If multiple geometries are passed, this will
        first union all geometries, which may be computationally expensive.
    columns : list, optional
        List of column names to import from the data source. Column names
        must exactly match the names in the data source. To avoid reading
        any columns (besides the geometry column), pass an empty list-like.
        By default reads all columns.
    rows : int or slice, default None
        Load in specific rows by passing an integer (first `n` rows) or a
        slice() object.
    engine : str,  "pyogrio" or "fiona"
        The underlying library that is used to read the file. Currently, the
        supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
        installed, otherwise tries "fiona". Engine can also be set globally
        with the ``geopandas.options.io_engine`` option.
    **kwargs :
        Keyword args to be passed to the engine, and can be used to read
        from multi-layer data, data stored within archives (zip files), etc.
        In case of the "pyogrio" engine, the keyword arguments are passed to
        `pyogrio.read_dataframe`. In case of the "fiona" engine, the keyword
        arguments are passed to `fiona.open` (or `fiona.BytesCollection` when
        the data is read from memory). For more information on possible
        keywords, type: ``import pyogrio; help(pyogrio.read_dataframe)``.

    Examples
    --------
    >>> df = geopandas.read_file("nybb.shp")  # doctest: +SKIP

    Specifying layer of GPKG:

    >>> df = geopandas.read_file("file.gpkg", layer='cities')  # doctest: +SKIP

    Reading only first 10 rows:

    >>> df = geopandas.read_file("nybb.shp", rows=10)  # doctest: +SKIP

    Reading only geometries intersecting ``mask``:

    >>> df = geopandas.read_file("nybb.shp", mask=polygon)  # doctest: +SKIP

    Reading only geometries intersecting ``bbox``:

    >>> df = geopandas.read_file("nybb.shp", bbox=(0, 0, 10, 20))  # doctest: +SKIP

    Returns
    -------
    :obj:`geopandas.GeoDataFrame` or :obj:`pandas.DataFrame` :
        If `ignore_geometry=True` a :obj:`pandas.DataFrame` will be returned.

    Raises
    ------
    ValueError
        If the resolved engine is neither "pyogrio" nor "fiona".

    Notes
    -----
    The format drivers will attempt to detect the encoding of your data, but
    may fail. In this case, the proper encoding can be specified explicitly
    by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.

    When specifying a URL, geopandas will check if the server supports reading
    partial data and in that case pass the URL as is to the underlying engine,
    which will then use the network file system handler of GDAL to read from
    the URL. Otherwise geopandas will download the data from the URL and pass
    all data in-memory to the underlying engine.
    If you need more control over how the URL is read, you can specify the
    GDAL virtual filesystem manually (e.g. ``/vsicurl/https://...``). See the
    GDAL documentation on filesystems for more details
    (https://gdal.org/user/virtual_file_systems.html#vsicurl-http-https-ftp-files-random-access).
    """
    engine = _check_engine(engine, "'read_file' function")

    filename = _expand_user(filename)

    from_bytes = False
    if _is_url(filename):
        # if it is a url that supports random access -> pass through to
        # pyogrio/fiona as is (to support downloading only part of the file)
        # otherwise still download manually because pyogrio/fiona don't support
        # all types of urls (https://github.com/geopandas/geopandas/issues/2908)
        with urllib.request.urlopen(filename) as response:
            if response.headers.get("Accept-Ranges") != "bytes":
                filename = response.read()
                from_bytes = True

    if engine == "pyogrio":
        return _read_file_pyogrio(
            filename, bbox=bbox, mask=mask, columns=columns, rows=rows, **kwargs
        )

    if engine != "fiona":
        raise ValueError(f"unknown engine '{engine}'")

    # fiona needs a path or raw bytes: drain file-like objects up front and
    # encode text content as UTF-8
    if pd.api.types.is_file_like(filename):
        data = filename.read()
        path_or_bytes = data.encode("utf-8") if isinstance(data, str) else data
        from_bytes = True
    else:
        path_or_bytes = filename

    return _read_file_fiona(
        path_or_bytes,
        from_bytes,
        bbox=bbox,
        mask=mask,
        columns=columns,
        rows=rows,
        **kwargs,
    )
def _read_file_fiona(
    path_or_bytes,
    from_bytes,
    bbox=None,
    mask=None,
    columns=None,
    rows=None,
    where=None,
    **kwargs,
):
    """
    Read a vector dataset with the fiona engine.

    Parameters
    ----------
    path_or_bytes : str, path object or bytes
        Location of the dataset, or its raw content when ``from_bytes`` is
        True.
    from_bytes : bool
        If True, ``path_or_bytes`` is opened with ``fiona.BytesCollection``.
        Otherwise it is converted to a string, passed through ``vsi_path``
        and opened with ``fiona.open``.
    bbox : GeoDataFrame, GeoSeries, shapely geometry or sequence, optional
        Bounding-box filter. (Geo)frames are reprojected to the dataset CRS
        and reduced to their total bounds, geometries to their ``bounds``;
        the resulting value must have length 4.
    mask : GeoDataFrame, GeoSeries, shapely geometry or other, optional
        Geometry filter. (Geo)frames and geometries are converted with
        ``mapping`` (this conversion only happens when ``bbox`` is None);
        any other value is handed to fiona unchanged.
    columns : list, optional
        Columns to read; forwarded to fiona as ``include_fields``.
    rows : int or slice, optional
        An integer ``n`` is treated as ``slice(n)``; the slice's start, stop
        and step are forwarded to ``features.filter``.
    where : optional
        Forwarded to ``features.filter`` as the ``where`` filter.
    **kwargs
        Passed to the fiona reader. ``ignore_geometry`` and
        ``include_fields`` are additionally inspected in this function.

    Returns
    -------
    GeoDataFrame or pandas.DataFrame
        A plain DataFrame is returned when ``ignore_geometry`` is truthy or
        the dataset schema reports the geometry as ``"None"``.

    Raises
    ------
    NotImplementedError
        If ``where`` or ``columns`` is used with fiona older than 1.9.
    ValueError
        If both ``columns`` and ``include_fields`` are given.
    TypeError
        If ``rows`` is neither an integer nor a slice.
    """
    if where is not None and not FIONA_GE_19:
        raise NotImplementedError("where requires fiona 1.9+")
    if columns is not None:
        if "include_fields" in kwargs:
            raise ValueError(
                "Cannot specify both 'include_fields' and 'columns' keywords"
            )
        if not FIONA_GE_19:
            raise NotImplementedError("'columns' keyword requires fiona 1.9+")
        kwargs["include_fields"] = columns
    elif "include_fields" in kwargs:
        # alias to columns, as this variable is used below to specify column order
        # in the dataframe creation
        columns = kwargs["include_fields"]
    if not from_bytes:
        # Opening a file via URL or file-like-object above automatically detects a
        # zipped file. In order to match that behavior, attempt to add a zip scheme
        # if missing.
        path_or_bytes = vsi_path(str(path_or_bytes))
    # in-memory content needs the bytes-based collection, paths use fiona.open
    if from_bytes:
        reader = fiona.BytesCollection
    else:
        reader = fiona.open
    with fiona_env():
        with reader(path_or_bytes, **kwargs) as features:
            # start from the WKT representation; replaced below when a more
            # compact identifier can be determined
            crs = features.crs_wkt
            # attempt to get EPSG code
            try:
                # fiona 1.9+
                epsg = features.crs.to_epsg(confidence_threshold=100)
                if epsg is not None:
                    crs = epsg
            except AttributeError:
                # fiona <= 1.8
                try:
                    crs = features.crs["init"]
                except (TypeError, KeyError):
                    pass
            # handle loading the bounding box
            if bbox is not None:
                if isinstance(bbox, (GeoDataFrame, GeoSeries)):
                    bbox = tuple(bbox.to_crs(crs).total_bounds)
                elif isinstance(bbox, BaseGeometry):
                    bbox = bbox.bounds
                assert len(bbox) == 4
            # handle loading the mask
            # (note: these branches are only reached when bbox is None)
            elif isinstance(mask, (GeoDataFrame, GeoSeries)):
                mask = mapping(mask.to_crs(crs).union_all())
            elif isinstance(mask, BaseGeometry):
                mask = mapping(mask)
            # collect only the filters that were actually requested
            filters = {}
            if bbox is not None:
                filters["bbox"] = bbox
            if mask is not None:
                filters["mask"] = mask
            if where is not None:
                filters["where"] = where
            # setup the data loading filter
            if rows is not None:
                if isinstance(rows, int):
                    rows = slice(rows)
                elif not isinstance(rows, slice):
                    raise TypeError("'rows' must be an integer or a slice.")
                f_filt = features.filter(rows.start, rows.stop, rows.step, **filters)
            elif filters:
                f_filt = features.filter(**filters)
            else:
                f_filt = features
            # get list of columns
            columns = columns or list(features.schema["properties"])
            # fields declared as "datetime" in the schema are post-processed below
            datetime_fields = [
                k for (k, v) in features.schema["properties"].items() if v == "datetime"
            ]
            if (
                kwargs.get("ignore_geometry", False)
                or features.schema["geometry"] == "None"
            ):
                # attribute-only result: build a plain DataFrame from the properties
                df = pd.DataFrame(
                    [record["properties"] for record in f_filt], columns=columns
                )
            else:
                df = GeoDataFrame.from_features(
                    f_filt, crs=crs, columns=columns + ["geometry"]
                )
            for k in datetime_fields:
                as_dt = None
                # plain try catch for when pandas will raise in the future
                # TODO we can tighten the exception type in future when it does
                try:
                    with warnings.catch_warnings():
                        # pandas 2.x does not yet enforce this behaviour but raises a
                        # warning -> we want to suppress this warning for our users,
                        # and do this by turning it into an error so we take the
                        # `except` code path to try again with utc=True
                        warnings.filterwarnings(
                            "error",
                            "In a future version of pandas, parsing datetimes with "
                            "mixed time zones will raise an error",
                            FutureWarning,
                        )
                        as_dt = pd.to_datetime(df[k])
                except Exception:
                    pass
                if as_dt is None or as_dt.dtype == "object":
                    # if to_datetime failed, try again for mixed timezone offsets
                    # This can still fail if there are invalid datetimes
                    try:
                        as_dt = pd.to_datetime(df[k], utc=True)
                    except Exception:
                        pass
                # if to_datetime succeeded, round datetimes as
                # fiona only supports up to ms precision (any microseconds are
                # floating point rounding error)
                if as_dt is not None and not (as_dt.dtype == "object"):
                    if PANDAS_GE_20:
                        df[k] = as_dt.dt.as_unit("ms")
                    else:
                        df[k] = as_dt.dt.round(freq="ms")
            return df
def _read_file_pyogrio(path_or_bytes, bbox=None, mask=None, rows=None, **kwargs):
    """
    Read a vector dataset with the pyogrio engine.

    Parameters
    ----------
    path_or_bytes : str, path object, bytes or file-like object
        Data source handed to ``pyogrio.read_info`` / ``pyogrio.read_dataframe``.
    bbox : GeoDataFrame, GeoSeries, shapely geometry or sequence, optional
        Bounding-box filter. (Geo)frames are reprojected to the CRS reported
        by ``pyogrio.read_info`` and reduced to their total bounds;
        geometries are reduced to their ``bounds``. Must end up with length 4.
    mask : GeoDataFrame, GeoSeries, shapely geometry, dict or object with
        ``__geo_interface__``, optional
        Geometry filter, converted to a single shapely geometry. Cannot be
        combined with ``bbox``.
    rows : int or slice, optional
        An integer is translated to ``max_features``; a slice to
        ``skip_features`` / ``max_features``.
    **kwargs
        Passed to ``pyogrio.read_dataframe``. The fiona-style keywords
        ``ignore_geometry``, ``ignore_fields`` and ``include_fields`` are
        translated to their pyogrio equivalents.

    Returns
    -------
    The result of ``pyogrio.read_dataframe``.

    Raises
    ------
    ValueError
        For a negative slice start, a slice with a step, ``bbox`` and
        ``mask`` given together, a ``bbox`` that is not of length 4, or
        conflicting column-selection keywords.
    TypeError
        If ``rows`` is neither an integer nor a slice.
    """
    import pyogrio

    if rows is not None:
        if isinstance(rows, int):
            kwargs["max_features"] = rows
        elif isinstance(rows, slice):
            if rows.start is not None:
                if rows.start < 0:
                    raise ValueError(
                        "Negative slice start not supported with the 'pyogrio' engine."
                    )
                kwargs["skip_features"] = rows.start
            if rows.stop is not None:
                kwargs["max_features"] = rows.stop - (rows.start or 0)
            if rows.step is not None:
                raise ValueError("slice with step is not supported")
        else:
            raise TypeError("'rows' must be an integer or a slice.")

    if bbox is not None and mask is not None:
        # match error message from Fiona
        raise ValueError("mask and bbox can not be set together")

    if bbox is not None:
        if isinstance(bbox, (GeoDataFrame, GeoSeries)):
            crs = pyogrio.read_info(path_or_bytes).get("crs")
            # rewind file-like input so the actual read starts from the beginning
            if isinstance(path_or_bytes, IOBase):
                path_or_bytes.seek(0)
            bbox = tuple(bbox.to_crs(crs).total_bounds)
        elif isinstance(bbox, BaseGeometry):
            bbox = bbox.bounds
        if len(bbox) != 4:
            raise ValueError("'bbox' should be a length-4 tuple.")

    if mask is not None:
        # NOTE: mask cannot be used at same time as bbox keyword
        if isinstance(mask, (GeoDataFrame, GeoSeries)):
            crs = pyogrio.read_info(path_or_bytes).get("crs")
            if isinstance(path_or_bytes, IOBase):
                path_or_bytes.seek(0)
            mask = shapely.unary_union(mask.to_crs(crs).geometry.values)
        elif isinstance(mask, BaseGeometry):
            mask = shapely.unary_union(mask)
        elif isinstance(mask, dict) or hasattr(mask, "__geo_interface__"):
            # convert GeoJSON to shapely geometry
            mask = shapely.geometry.shape(mask)
        kwargs["mask"] = mask

    if kwargs.pop("ignore_geometry", False):
        kwargs["read_geometry"] = False

    # shared text for both deprecated fiona-style keywords
    deprecation_msg = (
        "The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
        "will be removed in a future release. You can use the 'columns' keyword "
        "instead to select which columns to read."
    )

    # translate `ignore_fields`/`include_fields` keyword for back compat with fiona
    if "ignore_fields" in kwargs and "include_fields" in kwargs:
        raise ValueError("Cannot specify both 'ignore_fields' and 'include_fields'")
    elif "ignore_fields" in kwargs:
        if kwargs.get("columns", None) is not None:
            raise ValueError(
                "Cannot specify both 'columns' and 'ignore_fields' keywords"
            )
        warnings.warn(deprecation_msg, DeprecationWarning, stacklevel=3)
        ignore_fields = kwargs.pop("ignore_fields")
        fields = pyogrio.read_info(path_or_bytes)["fields"]
        # read_info may have consumed a file-like input; rewind it exactly as
        # the bbox/mask branches do so that read_dataframe sees the full data
        if isinstance(path_or_bytes, IOBase):
            path_or_bytes.seek(0)
        include_fields = [col for col in fields if col not in ignore_fields]
        kwargs["columns"] = include_fields
    elif "include_fields" in kwargs:
        # translate `include_fields` keyword for back compat with fiona engine
        if kwargs.get("columns", None) is not None:
            raise ValueError(
                "Cannot specify both 'columns' and 'include_fields' keywords"
            )
        warnings.warn(deprecation_msg, DeprecationWarning, stacklevel=3)
        kwargs["columns"] = kwargs.pop("include_fields")

    return pyogrio.read_dataframe(path_or_bytes, bbox=bbox, **kwargs)
def _detect_driver(path):
    """
    Guess the OGR driver name from the extension of ``path``.

    Objects exposing a ``name`` attribute (e.g. file handles) are resolved
    through that attribute first. Extensions that are not present in
    ``_EXTENSION_TO_DRIVER`` fall back to ``"ESRI Shapefile"``.
    """
    # in case the path is a file handle, use the name it was opened with
    target = getattr(path, "name", path)
    suffix = Path(target).suffix.lower()
    # Assume it is a shapefile folder for now when the extension is unknown.
    # In the future, will likely raise an exception when the expected
    # folder writing behavior is more clearly defined.
    return _EXTENSION_TO_DRIVER.get(suffix, "ESRI Shapefile")
def _to_file(
    df,
    filename,
    driver=None,
    schema=None,
    index=None,
    mode="w",
    crs=None,
    engine=None,
    metadata=None,
    **kwargs,
):
    """
    Write a GeoDataFrame to an OGR data source.

    The OGR drivers available in the current installation can be listed with:

    >>> import pyogrio
    >>> pyogrio.list_drivers()  # doctest: +SKIP

    Parameters
    ----------
    df : GeoDataFrame
        The data to be written.
    filename : string
        File path or file handle to write to. The path may specify a
        GDAL VSI scheme.
    driver : string, default None
        The OGR format driver used to write the vector file. When omitted
        it is inferred from the file extension; without a recognised
        extension an ESRI Shapefile is written to a folder.
    schema : dict, default None
        Schema dictionary handed to Fiona for finer control over how the
        file is written. If None, the schema is derived from the dtype of
        each column. Not supported for the "pyogrio" engine.
    index : bool, default None
        If True, write the index into one or more columns (for MultiIndex).
        The default None only writes the index when it is named, is a
        MultiIndex, or has a non-integer data type. If False, no index is
        written.

        .. versionadded:: 0.7
            Previously the index was not written.
    mode : string, default 'w'
        The write mode, 'w' to overwrite the existing file and 'a' to append;
        with the pyogrio engine ``append=True`` can be passed as well.
        Not all drivers support appending. For the fiona engine, the drivers
        that support appending are listed in fiona.supported_drivers or
        https://github.com/Toblerity/Fiona/blob/master/fiona/drvsupport.py.
        For the pyogrio engine, any driver of the installed GDAL that
        supports append capability should work; see the specific driver
        entry at https://gdal.org/drivers/vector/index.html.
    crs : pyproj.CRS, default None
        CRS handed to Fiona for finer control over how the file is written.
        If None, the ``crs`` attribute of ``df`` is used. The value can be
        anything accepted by
        :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
        such as an authority string (eg "EPSG:4326") or a WKT string.
    engine : str, "pyogrio" or "fiona"
        The underlying library that is used to write the file. Defaults to
        "pyogrio" if installed, otherwise tries "fiona". The engine can also
        be set globally with the ``geopandas.options.io_engine`` option.
    metadata : dict[str, str], default None
        Optional metadata to be stored in the file. Keys and values must be
        strings. Only supported for the "GPKG" driver
        (requires Fiona >= 1.9 or pyogrio >= 0.6).
    **kwargs :
        Keyword args passed on to the engine; they can be used to write
        multi-layer data, store data within archives (zip files), etc.
        For the "fiona" engine they are passed to `fiona.open`
        (see ``import fiona; help(fiona.open)``); for the "pyogrio" engine
        they are passed to `pyogrio.write_dataframe`.

    Notes
    -----
    The format drivers will attempt to detect the encoding of your data, but
    may fail. In this case, the proper encoding can be specified explicitly
    by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
    """
    engine = _check_engine(engine, "'to_file' method")
    filename = _expand_user(filename)

    if index is None:
        # Keep the index only when it carries information: a named level
        # (this also covers MultiIndex) or a non-integer dtype.
        has_named_levels = list(df.index.names) != [None]
        index = has_named_levels or not is_integer_dtype(df.index.dtype)
    if index:
        df = df.reset_index(drop=False)

    if driver is None:
        driver = _detect_driver(filename)

    if driver == "ESRI Shapefile":
        overlong_names = any(len(c) > 10 for c in df.columns.tolist())
        if overlong_names:
            warnings.warn(
                "Column names longer than 10 characters will be truncated when saved to "
                "ESRI Shapefile.",
                stacklevel=3,
            )

    geometry_column_count = (df.dtypes == "geometry").sum()
    if geometry_column_count > 1:
        raise ValueError(
            "GeoDataFrame contains multiple geometry columns but GeoDataFrame.to_file "
            "supports only a single geometry column. Use a GeoDataFrame.to_parquet or "
            "GeoDataFrame.to_feather, drop additional geometry columns or convert them "
            "to a supported format like a well-known text (WKT) using "
            "`GeoSeries.to_wkt()`.",
        )

    _check_metadata_supported(metadata, engine, driver)

    if mode not in ("w", "a"):
        raise ValueError(f"'mode' should be one of 'w' or 'a', got '{mode}' instead")

    # hand over to the engine-specific writer
    if engine not in ("pyogrio", "fiona"):
        raise ValueError(f"unknown engine '{engine}'")
    writer = _to_file_pyogrio if engine == "pyogrio" else _to_file_fiona
    writer(df, filename, driver, schema, crs, mode, metadata, **kwargs)
def _to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs):
    """
    Write ``df`` to ``filename`` through fiona.

    The schema is inferred with ``infer_schema`` when not given. An explicit
    ``crs`` is normalised with ``pyproj.CRS.from_user_input``; otherwise
    ``df.crs`` is used. The CRS is written as WKT, using the ``WKT1_GDAL``
    flavour when the detected GDAL version is older than 3.0.0.

    Raises
    ------
    ImportError
        If a ``crs`` is passed while pyproj is not available.
    """
    if crs and not HAS_PYPROJ:
        raise ImportError(
            "The 'pyproj' package is required to write a file with a CRS, but it is not"
            " installed or does not import correctly."
        )

    if schema is None:
        schema = infer_schema(df)

    if not crs:
        crs = df.crs
    else:
        from pyproj import CRS

        crs = CRS.from_user_input(crs)

    with fiona_env():
        try:
            release_name = fiona.env.get_gdal_release_name().strip("e")  # GH3147
            gdal_version = Version(release_name)
        except (AttributeError, ValueError):
            # just assume it is not the latest
            gdal_version = Version("2.0.0")

        # pick the WKT flavour that the detected GDAL version understands
        crs_wkt = None
        if gdal_version >= Version("3.0.0"):
            if crs:
                crs_wkt = crs.to_wkt()
        elif crs:
            crs_wkt = crs.to_wkt("WKT1_GDAL")

        open_options = dict(
            mode=mode, driver=driver, crs_wkt=crs_wkt, schema=schema, **kwargs
        )
        with fiona.open(filename, **open_options) as colxn:
            if metadata is not None:
                colxn.update_tags(metadata)
            colxn.writerecords(df.iterfeatures())
def _to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs):
    """
    Write ``df`` to ``filename`` through ``pyogrio.write_dataframe``.

    ``mode="a"`` is translated to ``append=True``.

    Raises
    ------
    ValueError
        If ``schema`` or ``crs`` is given (neither is supported by this
        engine) or if ``df`` has duplicated column names.
    """
    import pyogrio

    if schema is not None:
        raise ValueError(
            "The 'schema' argument is not supported with the 'pyogrio' engine."
        )
    if crs is not None:
        raise ValueError("Passing 'crs' is not supported with the 'pyogrio' engine.")
    # for the fiona engine, this check is done in gdf.iterfeatures()
    if not df.columns.is_unique:
        raise ValueError("GeoDataFrame cannot contain duplicated column names.")

    write_options = dict(kwargs)
    if mode == "a":
        write_options["append"] = True

    pyogrio.write_dataframe(
        df, filename, driver=driver, metadata=metadata, **write_options
    )
def infer_schema(df):
    """
    Build a Fiona-style schema dictionary for ``df``.

    Returns
    -------
    dict
        ``{"geometry": ..., "properties": OrderedDict}`` where the geometry
        entry comes from ``_geometry_types`` and the properties map every
        non-geometry column to a type name.

    Warns
    -----
    UserWarning
        If ``df`` is empty.
    """
    from collections import OrderedDict

    # TODO: test pandas string type and boolean type once released
    dtype_aliases = {
        "Int32": "int32",
        "int32": "int32",
        "Int64": "int",
        "string": "str",
        "boolean": "bool",
    }

    def _field_type(dtype):
        """Translate a column dtype into the schema type name."""
        if dtype == object:
            return "str"
        if dtype.name.startswith("datetime64"):
            # numpy datetime type regardless of frequency
            return "datetime"
        alias = dtype_aliases.get(str(dtype))
        if alias is None:
            # fall back to the name of the matching Python scalar type
            alias = type(np.zeros(1, dtype).item()).__name__
        return "int" if alias == "long" else alias

    properties = OrderedDict()
    for col, dtype in zip(df.columns, df.dtypes):
        if col != df._geometry_column_name:
            properties[col] = _field_type(dtype)

    if df.empty:
        warnings.warn(
            "You are attempting to write an empty DataFrame to file. "
            "For some drivers, this operation may fail.",
            UserWarning,
            stacklevel=3,
        )

    # Since https://github.com/Toblerity/Fiona/issues/446 resolution,
    # Fiona allows a list of geometry types
    return {"geometry": _geometry_types(df), "properties": properties}
def _geometry_types(df):
    """
    Collect the geometry type names of ``df`` for use in a schema.

    Types of geometries with a Z coordinate are prefixed with ``"3D "`` and
    listed first. Returns ``"Unknown"`` when no type is found, a single
    string when exactly one type is found, and a list otherwise.
    """
    is_3d = df.geometry.has_z
    flat_names = df[~is_3d].geometry.geom_type.unique()
    solid_names = df[is_3d].geometry.geom_type.unique()

    found = ["3D " + name for name in solid_names if name is not None]
    found.extend(name for name in flat_names if name is not None)

    if not found:
        # Default geometry type supported by Fiona
        # (Since https://github.com/Toblerity/Fiona/issues/446 resolution)
        return "Unknown"
    if len(found) == 1:
        return found[0]
    return found
def _list_layers(filename) -> pd.DataFrame:
    """List layers available in a file.

    Gives an overview of the layers found in a file or URL together with
    their geometry types. When supported by the data source, both spatial
    and non-spatial layers are included; non-spatial layers have ``None`` in
    the ``"geometry_type"`` column. GeoPandas will not read such layers but
    they can be read into a pd.DataFrame using
    :func:`pyogrio.read_dataframe`.

    Parameters
    ----------
    filename : str, path object or file-like object
        Either the absolute or relative path to the file or URL to
        be opened, or any object with a read() method (such as an open file
        or StringIO)

    Returns
    -------
    pandas.DataFrame
        A DataFrame with columns "name" and "geometry_type" and one row per layer.
    """
    _import_pyogrio()
    _check_pyogrio("list_layers")

    import pyogrio

    layers = pyogrio.list_layers(filename)
    return pd.DataFrame(layers, columns=["name", "geometry_type"])
@@ -0,0 +1,473 @@
import warnings
from contextlib import contextmanager
from functools import lru_cache
import pandas as pd
import shapely
import shapely.wkb
from geopandas import GeoDataFrame
@contextmanager
def _get_conn(conn_or_engine):
    """
    Yield a connection that is inside a transaction.

    ``Engine.begin()`` hands back a Connection with an implicit Transaction,
    whereas ``Connection.begin()`` hands back the Transaction itself. This
    helper hides that difference and always yields a Connection; a
    Connection that is already in a transaction is yielded as is.

    Parameters
    ----------
    conn_or_engine : Connection or Engine
        A sqlalchemy Connection or Engine instance

    Returns
    -------
    Connection

    Raises
    ------
    ValueError
        If ``conn_or_engine`` is neither a Connection nor an Engine.
    """
    from sqlalchemy.engine.base import Connection, Engine

    if not isinstance(conn_or_engine, (Connection, Engine)):
        raise ValueError(f"Unknown Connectable: {conn_or_engine}")

    if isinstance(conn_or_engine, Connection):
        if conn_or_engine.in_transaction():
            # reuse the transaction the caller has already opened
            yield conn_or_engine
        else:
            with conn_or_engine.begin():
                yield conn_or_engine
    else:
        with conn_or_engine.begin() as conn:
            yield conn
def _df_to_geodf(df, geom_col="geom", crs=None, con=None):
"""
Transforms a pandas DataFrame into a GeoDataFrame.
The column 'geom_col' must be a geometry column in WKB representation.
To be used to convert df based on pd.read_sql to gdf.
Parameters
----------
df : DataFrame
pandas DataFrame with geometry column in WKB representation.
geom_col : string, default 'geom'
column name to convert to shapely geometries
crs : pyproj.CRS, optional
CRS to use for the returned GeoDataFrame. The value can be anything accepted
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
If not set, tries to determine CRS from the SRID associated with the
first geometry in the database, and assigns that to all geometries.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the database to query.
Returns
-------
GeoDataFrame
"""
if geom_col not in df:
raise ValueError("Query missing geometry column '{}'".format(geom_col))
if df.columns.to_list().count(geom_col) > 1:
raise ValueError(
f"Duplicate geometry column '{geom_col}' detected in SQL query output. Only"
"one geometry column is allowed."
)
geoms = df[geom_col].dropna()
if not geoms.empty:
load_geom_bytes = shapely.wkb.loads
"""Load from Python 3 binary."""
def load_geom_text(x):
"""Load from binary encoded as text."""
return shapely.wkb.loads(str(x), hex=True)
if isinstance(geoms.iat[0], bytes):
load_geom = load_geom_bytes
else:
load_geom = load_geom_text
df[geom_col] = geoms = geoms.apply(load_geom)
if crs is None:
srid = shapely.get_srid(geoms.iat[0])
# if no defined SRID in geodatabase, returns SRID of 0
if srid != 0:
try:
spatial_ref_sys_df = _get_spatial_ref_sys_df(con, srid)
except pd.errors.DatabaseError:
warning_msg = (
f"Could not find the spatial reference system table "
f"(spatial_ref_sys) in PostGIS."
f"Trying epsg:{srid} as a fallback."
)
warnings.warn(warning_msg, UserWarning, stacklevel=3)
crs = "epsg:{}".format(srid)
else:
if not spatial_ref_sys_df.empty:
auth_name = spatial_ref_sys_df["auth_name"].item()
crs = f"{auth_name}:{srid}"
else:
warning_msg = (
f"Could not find srid {srid} in the "
f"spatial_ref_sys table. "
f"Trying epsg:{srid} as a fallback."
)
warnings.warn(warning_msg, UserWarning, stacklevel=3)
crs = "epsg:{}".format(srid)
return GeoDataFrame(df, crs=crs, geometry=geom_col)
def _read_postgis(
sql,
con,
geom_col="geom",
crs=None,
index_col=None,
coerce_float=True,
parse_dates=None,
params=None,
chunksize=None,
):
"""
Returns a GeoDataFrame corresponding to the result of the query
string, which must contain a geometry column in WKB representation.
It is also possible to use :meth:`~GeoDataFrame.read_file` to read from a database.
Especially for file geodatabases like GeoPackage or SpatiaLite this can be easier.
Parameters
----------
sql : string
SQL query to execute in selecting entries from database, or name
of the table to read from the database.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the database to query.
geom_col : string, default 'geom'
column name to convert to shapely geometries
crs : dict or str, optional
CRS to use for the returned GeoDataFrame; if not set, tries to
determine CRS from the SRID associated with the first geometry in
the database, and assigns that to all geometries.
chunksize : int, default None
If specified, return an iterator where chunksize is the number of rows to
include in each chunk.
See the documentation for pandas.read_sql for further explanation
of the following parameters:
index_col, coerce_float, parse_dates, params, chunksize
Returns
-------
GeoDataFrame
Examples
--------
PostGIS
>>> from sqlalchemy import create_engine # doctest: +SKIP
>>> db_connection_url = "postgresql://myusername:mypassword@myhost:5432/mydatabase"
>>> con = create_engine(db_connection_url) # doctest: +SKIP
>>> sql = "SELECT geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
SpatiaLite
>>> sql = "SELECT ST_AsBinary(geom) AS geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
"""
if chunksize is None:
# read all in one chunk and return a single GeoDataFrame
df = pd.read_sql(
sql,
con,
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
params=params,
chunksize=chunksize,
)
return _df_to_geodf(df, geom_col=geom_col, crs=crs, con=con)
else:
# read data in chunks and return a generator
df_generator = pd.read_sql(
sql,
con,
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
params=params,
chunksize=chunksize,
)
return (
_df_to_geodf(df, geom_col=geom_col, crs=crs, con=con) for df in df_generator
)
def _get_geometry_type(gdf):
    """
    Determine the basic geometry type of a GeoDataFrame. See also:
    https://geoalchemy-2.readthedocs.io/en/latest/types.html#geoalchemy2.types._GISType

    Rules:

    - when all geometries share one geometry type, that type is used
      (upper-cased): Point, LineString, Polygon, MultiPoint,
      MultiLineString, MultiPolygon, GeometryCollection.
    - LinearRing geometries are reported as LINESTRING.
    - in every other case the type is GEOMETRY, e.g. for
        - a mix of Polygons and MultiPolygons in GeoSeries
        - a mix of Points and LineStrings in GeoSeries
    - if any of the geometries has a Z-coordinate, ``"Z"`` is appended.

    Returns
    -------
    tuple of (str, bool)
        The target geometry type and whether LinearRing geometries are
        present.

    Raises
    ------
    ValueError
        If the only geometry type found is None.
    """
    kinds = list(gdf.geometry.geom_type.unique())
    has_curve = any(kind is not None and "LinearRing" in kind for kind in kinds)

    if len(kinds) != 1:
        target_geom_type = "GEOMETRY"
    elif has_curve:
        target_geom_type = "LINESTRING"
    elif kinds[0] is None:
        raise ValueError("No valid geometries in the data.")
    else:
        target_geom_type = kinds[0].upper()

    # Check for 3D-coordinates
    if any(gdf.geometry.has_z):
        target_geom_type += "Z"

    return target_geom_type, has_curve
def _get_srid_from_crs(gdf):
"""
Get EPSG code from CRS if available. If not, return 0.
"""
# Use geoalchemy2 default for srid
# Note: undefined srid in PostGIS is 0
srid = None
warning_msg = (
"Could not parse CRS from the GeoDataFrame. "
"Inserting data without defined CRS."
)
if gdf.crs is not None:
try:
for confidence in (100, 70, 25):
srid = gdf.crs.to_epsg(min_confidence=confidence)
if srid is not None:
break
auth_srid = gdf.crs.to_authority(
auth_name="ESRI", min_confidence=confidence
)
if auth_srid is not None:
srid = int(auth_srid[1])
break
except Exception:
warnings.warn(warning_msg, UserWarning, stacklevel=2)
if srid is None:
srid = 0
warnings.warn(warning_msg, UserWarning, stacklevel=2)
return srid
def _convert_linearring_to_linestring(gdf, geom_name):
    """
    Replace every LinearRing in column ``geom_name`` with a LineString.

    ``gdf`` is modified in place and also returned.
    """
    from shapely.geometry import LineString

    def _as_linestring(ring):
        return LineString(ring)

    # Todo: Use shapely function once it's implemented:
    # https://github.com/shapely/shapely/issues/1617
    is_ring = gdf.geom_type == "LinearRing"
    rings = gdf.loc[is_ring, geom_name]
    gdf.loc[is_ring, geom_name] = rings.apply(_as_linestring)
    return gdf
def _convert_to_ewkb(gdf, geom_name, srid):
    """
    Return a plain DataFrame whose ``geom_name`` column holds hex EWKB.

    The geometries are tagged with ``srid`` before being serialised with the
    SRID included.
    """
    tagged = shapely.set_srid(gdf[geom_name].values._data, srid=srid)
    ewkb_hex = shapely.to_wkb(tagged, hex=True, include_srid=True)

    # The gdf will warn that the geometry column doesn't hold in-memory geometries
    # now that they are EWKB, so convert back to a regular dataframe to avoid warning
    # the user that the dtypes are unexpected.
    plain = pd.DataFrame(gdf, copy=False)
    plain[geom_name] = ewkb_hex
    return plain
def _psql_insert_copy(tbl, conn, keys, data_iter):
    """
    Insert rows with a ``COPY ... FROM STDIN WITH CSV`` statement.

    The rows of ``data_iter`` are serialised to an in-memory CSV buffer and
    streamed through the DBAPI cursor obtained from ``conn.connection``.
    The signature matches what is passed as ``method`` to ``to_sql`` in
    ``_write_postgis``.

    Parameters
    ----------
    tbl : object exposing ``table.schema`` and ``table.name``
    conn : object exposing the DBAPI connection as ``connection``
    keys : iterable of str
        Column names.
    data_iter : iterable
        Rows to insert.
    """
    import csv
    import io

    buffer = io.StringIO()
    csv.writer(buffer).writerows(data_iter)
    buffer.seek(0)

    quoted_columns = ", ".join(['"{}"'.format(key) for key in keys])
    raw_connection = conn.connection
    copy_statement = 'COPY "{}"."{}" ({}) FROM STDIN WITH CSV'.format(
        tbl.table.schema, tbl.table.name, quoted_columns
    )

    with raw_connection.cursor() as cursor:
        if callable(getattr(cursor, "copy", None)):
            # Use psycopg method if it's available
            with cursor.copy(copy_statement) as copy_operation:
                copy_operation.write(buffer.read())
        else:
            # otherwise use psycopg2 method
            cursor.copy_expert(copy_statement, buffer)
def _write_postgis(
    gdf,
    name,
    con,
    schema=None,
    if_exists="fail",
    index=False,
    index_label=None,
    chunksize=None,
    dtype=None,
):
    """
    Upload GeoDataFrame into PostGIS database.

    This method requires SQLAlchemy and GeoAlchemy2, and a PostgreSQL
    Python driver (e.g. psycopg2) to be installed.

    Parameters
    ----------
    gdf : GeoDataFrame
        The data to upload. It is copied first, so the input is not changed.
    name : str
        Name of the target table.
    con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
        Active connection to the PostGIS database.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists:

        - fail: Raise a ValueError.
        - replace: Drop the table before inserting new values.
        - append: Insert new values to the existing table.
    schema : string, optional
        Specify the schema. If None, use default schema: 'public'.
    index : bool, default False
        Write DataFrame index as a column.
        Uses *index_label* as the column name in the table.
    index_label : string or sequence, default None
        Column label for index column(s).
        If None is given (default) and index is True,
        then the index names are used.
    chunksize : int, optional
        Rows will be written in batches of this size at a time.
        By default, all rows will be written at once.
    dtype : dict of column name to SQL type, default None
        Specifying the datatype for columns.
        The keys should be the column names and the values
        should be the SQLAlchemy types. The entry for the geometry column
        is always set by this function; the passed dict is not modified.

    Raises
    ------
    ImportError
        If geoalchemy2 / sqlalchemy cannot be imported.
    ValueError
        If ``if_exists="append"`` and the SRID of the existing table differs
        from the SRID of ``gdf``.

    Examples
    --------

    >>> from sqlalchemy import create_engine  # doctest: +SKIP
    >>> db_connection_url = "postgresql://myusername:mypassword@myhost:5432/mydatabase"
    >>> engine = create_engine(db_connection_url)  # doctest: +SKIP
    >>> gdf.to_postgis("my_table", engine)  # doctest: +SKIP
    """
    try:
        from geoalchemy2 import Geometry
        from sqlalchemy import text
    except ImportError as err:
        raise ImportError("'to_postgis()' requires geoalchemy2 package.") from err

    gdf = gdf.copy()
    geom_name = gdf.geometry.name

    # Get srid
    srid = _get_srid_from_crs(gdf)

    # Get geometry type and info whether data contains LinearRing.
    geometry_type, has_curve = _get_geometry_type(gdf)

    # Build dtype with Geometry. Work on a copy so that the dictionary passed
    # in by the caller is not mutated.
    dtype = {} if dtype is None else dict(dtype)
    dtype[geom_name] = Geometry(geometry_type=geometry_type, srid=srid)

    # Convert LinearRing geometries to LineString
    if has_curve:
        gdf = _convert_linearring_to_linestring(gdf, geom_name)

    # Convert geometries to EWKB
    gdf = _convert_to_ewkb(gdf, geom_name, srid)

    if schema is not None:
        schema_name = schema
    else:
        schema_name = "public"

    if if_exists == "append":
        # Check that the geometry srid matches with the current GeoDataFrame
        with _get_conn(con) as connection:
            # Only check SRID if table exists
            if connection.dialect.has_table(connection, name, schema):
                target_srid = connection.execute(
                    text(
                        "SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
                            schema=schema_name, table=name, geom_col=geom_name
                        )
                    )
                ).fetchone()[0]

                if target_srid != srid:
                    msg = (
                        "The CRS of the target table (EPSG:{epsg_t}) differs from the "
                        "CRS of current GeoDataFrame (EPSG:{epsg_src}).".format(
                            epsg_t=target_srid, epsg_src=srid
                        )
                    )
                    raise ValueError(msg)

    with _get_conn(con) as connection:
        gdf.to_sql(
            name,
            connection,
            schema=schema_name,
            if_exists=if_exists,
            index=index,
            index_label=index_label,
            chunksize=chunksize,
            dtype=dtype,
            method=_psql_insert_copy,
        )
@lru_cache
def _get_spatial_ref_sys_df(con, srid):
spatial_ref_sys_sql = (
f"SELECT srid, auth_name FROM spatial_ref_sys WHERE srid = {srid}"
)
return pd.read_sql(spatial_ref_sys_sql, con)
@@ -0,0 +1,100 @@
"""
Script to create the data and write legacy storage (pickle) files.
Based on pandas' generate_legacy_storage_files.py script.
To use this script, create an environment for which you want to
generate pickles, activate the environment, and run this script as:
$ python geopandas/geopandas/io/tests/generate_legacy_storage_files.py \
geopandas/geopandas/io/tests/data/pickle/ pickle
This script generates a storage file for the current arch, system,
and python version.
The idea here is you are using the *current* version of the
generate_legacy_storage_files with an *older* version of geopandas to
generate a pickle file. We will then check this file into a current
branch, and test using test_pickle.py. This will load the *older*
pickles and test versus the current data that is generated
(with master). These are then compared.
"""
import os
import pickle
import platform
import sys
import pandas as pd
from shapely.geometry import Point
import geopandas
def create_pickle_data():
    """
    Build the GeoDataFrames that get pickled.

    Returns a dict with two frames: ``"gdf_the_geom"`` (custom geometry
    column name) and ``"gdf_crs"`` (with a CRS set).
    """

    def _diagonal_points():
        # fresh Point objects for every frame
        return [Point(i, i) for i in (1, 2, 3)]

    frames = {}

    # custom geometry column name
    frames["gdf_the_geom"] = geopandas.GeoDataFrame(
        {"a": [1, 2, 3], "the_geom": _diagonal_points()},
        geometry="the_geom",
    )

    # with crs
    frames["gdf_crs"] = geopandas.GeoDataFrame(
        {"a": [0.1, 0.2, 0.3], "geometry": _diagonal_points()},
        crs="EPSG:4326",
    )

    return frames
def platform_name():
    """
    Return an identifier for the current environment.

    The parts are the geopandas version, pandas version, Python version,
    machine type and lower-cased system name, joined by underscores.
    """
    parts = (
        f"{geopandas.__version__}",
        f"pd-{pd.__version__}",
        f"py-{platform.python_version()}",
        f"{platform.machine()}",
        f"{platform.system().lower()}",
    )
    return "_".join(parts)
def write_legacy_pickles(output_dir):
    """
    Write the legacy pickle file for the current environment.

    The file is named ``<platform_name()>.pickle`` and is created inside
    ``output_dir``. Progress information is printed to stdout.

    Parameters
    ----------
    output_dir : str
        Directory in which the pickle file is created.
    """
    print(
        "This script generates a storage file for the current arch, system, "
        "and python version"
    )
    # format the message before printing it: calling ``.format`` on the
    # return value of ``print`` (None) raises an AttributeError
    print("geopandas version: {}".format(geopandas.__version__))
    print(" output dir : {}".format(output_dir))
    print(" storage format: pickle")

    pth = "{}.pickle".format(platform_name())

    # the context manager closes the file even if pickling fails
    with open(os.path.join(output_dir, pth), "wb") as fh:
        pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)

    print("created pickle file: {}".format(pth))
def main():
    """
    Command-line entry point.

    Expects exactly two arguments, the output directory and the storage
    type; exits with a message when they are missing or when the storage
    type is not ``"pickle"``.
    """
    if len(sys.argv) != 3:
        sys.exit(
            "Specify output directory and storage type: generate_legacy_"
            "storage_files.py <output_dir> <storage_type> "
        )

    _, raw_output_dir, raw_storage_type = sys.argv
    output_dir = str(raw_output_dir)
    storage_type = str(raw_storage_type)

    if storage_type != "pickle":
        sys.exit("storage_type must be one of {'pickle'}")

    write_legacy_pickles(output_dir=output_dir)
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,328 @@
import os
from shapely.geometry import (
LineString,
MultiLineString,
MultiPoint,
MultiPolygon,
Point,
Polygon,
)
import geopandas
from geopandas import GeoDataFrame
from .test_file import FIONA_MARK, PYOGRIO_MARK
import pytest
from geopandas.testing import assert_geodataframe_equal
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
# Shared geometry fixtures used to build the GeoDataFrames written below.

# Polygon with a closed exterior ring (the first vertex is repeated last).
city_hall_boundaries = Polygon(
    (
        (-73.5541107525234, 45.5091983609661),
        (-73.5546126200639, 45.5086813829106),
        (-73.5540185061397, 45.5084409343852),
        (-73.5539986525799, 45.5084323044531),
        (-73.5535801792994, 45.5089539203786),
        (-73.5541107525234, 45.5091983609661),
    )
)
# Second polygon; two of its vertices also appear in city_hall_boundaries.
vauquelin_place = Polygon(
    (
        (-73.5542465586147, 45.5081555487952),
        (-73.5540185061397, 45.5084409343852),
        (-73.5546126200639, 45.5086813829106),
        (-73.5548825850032, 45.5084033554357),
        (-73.5542465586147, 45.5081555487952),
    )
)
# Two LineStrings built from vertices of the city_hall_boundaries ring.
city_hall_walls = [
    LineString(
        (
            (-73.5541107525234, 45.5091983609661),
            (-73.5546126200639, 45.5086813829106),
            (-73.5540185061397, 45.5084409343852),
        )
    ),
    LineString(
        (
            (-73.5539986525799, 45.5084323044531),
            (-73.5535801792994, 45.5089539203786),
            (-73.5541107525234, 45.5091983609661),
        )
    ),
]
# 2D points.
city_hall_entrance = Point(-73.553785, 45.508722)
city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)
# Same x/y as city_hall_entrance, with a z coordinate of 300.
point_3D = Point(-73.553785, 45.508722, 300)
# *****************************************
# TEST TOOLING
class _ExpectedError:
    """Exception type and message pattern expected from a failing write."""

    def __init__(self, error_type, error_message_match):
        # ``type`` holds the exception class, ``match`` the message pattern.
        self.type, self.match = error_type, error_message_match
class _ExpectedErrorBuilder:
    """Fluent helper that registers an expected error for one composite key."""

    def __init__(self, composite_key):
        self.composite_key = composite_key

    def to_raise(self, error_type, error_match):
        """Store the expected error in ``_expected_exceptions`` under the key."""
        expectation = _ExpectedError(error_type, error_match)
        _expected_exceptions[self.composite_key] = expectation
def _expect_writing(gdf, ogr_driver):
    """Return a builder used to declare the error for ``gdf`` and ``ogr_driver``."""
    key = _composite_key(gdf, ogr_driver)
    return _ExpectedErrorBuilder(key)
def _composite_key(gdf, ogr_driver):
    """Return a hashable key combining the identity of ``gdf`` with the driver.

    The frame is identified by ``id(gdf)``, so the key refers to that exact
    object rather than to its contents.
    """
    members = (id(gdf), ogr_driver)
    return frozenset(members)
def _expected_error_on(gdf, ogr_driver):
    """Return the error registered for ``gdf`` and ``ogr_driver``, or ``None``."""
    return _expected_exceptions.get(_composite_key(gdf, ogr_driver))
# *****************************************
# TEST CASES
# Frames to write; used as the params of the `geodataframe` fixture.
_geodataframes_to_write = []
# Maps a composite (frame identity, driver) key to an `_ExpectedError`.
_expected_exceptions = {}
_CRS = "epsg:4326"

# NOTE: `gdf` is rebound for every case below. Expected errors are keyed on
# `id(gdf)`, so each `_expect_writing(gdf, ...)` call must directly follow the
# frame it refers to, before `gdf` is reassigned.

# ------------------
# gdf with Points
gdf = GeoDataFrame(
    {"a": [1, 2]}, crs=_CRS, geometry=[city_hall_entrance, city_hall_balcony]
)
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with MultiPoints
gdf = GeoDataFrame(
    {"a": [1, 2]},
    crs=_CRS,
    geometry=[
        MultiPoint([city_hall_balcony, city_hall_council_chamber]),
        MultiPoint([city_hall_entrance, city_hall_balcony, city_hall_council_chamber]),
    ],
)
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with Points and MultiPoints
gdf = GeoDataFrame(
    {"a": [1, 2]},
    crs=_CRS,
    geometry=[MultiPoint([city_hall_entrance, city_hall_balcony]), city_hall_balcony],
)
_geodataframes_to_write.append(gdf)
# 'ESRI Shapefile' driver supports writing LineString/MultiLinestring and
# Polygon/MultiPolygon but does not mention Point/MultiPoint
# see https://www.gdal.org/drv_shapefile.html
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")

# ------------------
# gdf with LineStrings
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=city_hall_walls)
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with MultiLineStrings
gdf = GeoDataFrame(
    {"a": [1, 2]},
    crs=_CRS,
    geometry=[MultiLineString(city_hall_walls), MultiLineString(city_hall_walls)],
)
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with LineStrings and MultiLineStrings
gdf = GeoDataFrame(
    {"a": [1, 2]},
    crs=_CRS,
    geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]],
)
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with Polygons
gdf = GeoDataFrame(
    {"a": [1, 2]}, crs=_CRS, geometry=[city_hall_boundaries, vauquelin_place]
)
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with MultiPolygon
gdf = GeoDataFrame(
    {"a": [1]},
    crs=_CRS,
    geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))],
)
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with Polygon and MultiPolygon
gdf = GeoDataFrame(
    {"a": [1, 2]},
    crs=_CRS,
    geometry=[
        MultiPolygon((city_hall_boundaries, vauquelin_place)),
        city_hall_boundaries,
    ],
)
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with null geometry and Point
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, city_hall_entrance])
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with null geometry and 3D Point
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, point_3D])
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with null geometries only
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, None])
_geodataframes_to_write.append(gdf)

# ------------------
# gdf with all shape types mixed together
gdf = GeoDataFrame(
    {"a": [1, 2, 3, 4, 5, 6]},
    crs=_CRS,
    geometry=[
        MultiPolygon((city_hall_boundaries, vauquelin_place)),
        city_hall_entrance,
        MultiLineString(city_hall_walls),
        city_hall_walls[0],
        MultiPoint([city_hall_entrance, city_hall_balcony]),
        city_hall_balcony,
    ],
)
_geodataframes_to_write.append(gdf)
# Not supported by 'ESRI Shapefile' driver
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")

# ------------------
# gdf with all 2D shape types and 3D Point mixed together
gdf = GeoDataFrame(
    {"a": [1, 2, 3, 4, 5, 6, 7]},
    crs=_CRS,
    geometry=[
        MultiPolygon((city_hall_boundaries, vauquelin_place)),
        city_hall_entrance,
        MultiLineString(city_hall_walls),
        city_hall_walls[0],
        MultiPoint([city_hall_entrance, city_hall_balcony]),
        city_hall_balcony,
        point_3D,
    ],
)
_geodataframes_to_write.append(gdf)
# Not supported by 'ESRI Shapefile' driver
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
@pytest.fixture(params=_geodataframes_to_write)
def geodataframe(request):
    """Yield each frame registered in ``_geodataframes_to_write`` in turn."""
    frame = request.param
    return frame
@pytest.fixture(
    params=[
        ("GeoJSON", ".geojson"),
        ("ESRI Shapefile", ".shp"),
        ("GPKG", ".gpkg"),
        ("SQLite", ".sqlite"),
    ]
)
def ogr_driver(request):
    """Provide a ``(driver name, file extension)`` pair per parametrization."""
    driver_and_extension = request.param
    return driver_and_extension
@pytest.fixture(
    params=[
        pytest.param("fiona", marks=FIONA_MARK),
        pytest.param("pyogrio", marks=PYOGRIO_MARK),
    ]
)
def engine(request):
    """Provide the IO engine name, ``"fiona"`` or ``"pyogrio"``."""
    engine_name = request.param
    return engine_name
def test_to_file_roundtrip(tmpdir, geodataframe, ogr_driver, engine):
    """Write ``geodataframe`` with the given driver and engine, then read it back.

    When an error is registered for the (frame, driver) pair, the write is
    expected to raise ``RuntimeError``. Otherwise the file is read back and
    compared with the original frame.
    """
    driver, ext = ogr_driver
    output_file = os.path.join(str(tmpdir), "output_file" + ext)
    write_kwargs = {}
    if driver == "SQLite":
        write_kwargs["spatialite"] = True

        # This if statement can be removed once minimal fiona version >= 1.8.20
        if engine == "fiona":
            from packaging.version import Version
            import fiona

            if Version(fiona.__version__) < Version("1.8.20"):
                pytest.skip("SQLite driver only available from version 1.8.20")

        # If only 3D Points, geometry_type needs to be specified for spatialite at the
        # moment. This if can be removed once the following PR is released:
        # https://github.com/geopandas/pyogrio/pull/223
        # The row count is compared with 2; the previous
        # ``len(geodataframe == 2)`` measured an element-wise comparison
        # result and was truthy for every non-empty frame.
        if (
            engine == "pyogrio"
            and len(geodataframe) == 2
            and geodataframe.geometry[0] is None
            and geodataframe.geometry[1] is not None
            and geodataframe.geometry[1].has_z
        ):
            write_kwargs["geometry_type"] = "Point Z"

    expected_error = _expected_error_on(geodataframe, driver)
    if expected_error:
        # The pattern accepts two alternative messages rather than using
        # ``expected_error.match``.
        with pytest.raises(
            RuntimeError, match="Failed to write record|Could not add feature to layer"
        ):
            geodataframe.to_file(
                output_file, driver=driver, engine=engine, **write_kwargs
            )
    else:
        if driver == "SQLite" and engine == "pyogrio":
            try:
                geodataframe.to_file(
                    output_file, driver=driver, engine=engine, **write_kwargs
                )
            except ValueError as e:
                if "unrecognized option 'SPATIALITE'" in str(e):
                    pytest.xfail(
                        "pyogrio wheels from PyPI do not come with SpatiaLite support. "
                        f"Error: {e}"
                    )
                raise
        else:
            geodataframe.to_file(
                output_file, driver=driver, engine=engine, **write_kwargs
            )

        reloaded = geopandas.read_file(output_file, engine=engine)

        if driver == "GeoJSON" and engine == "pyogrio":
            # For GeoJSON files, the int64 column comes back as int32
            reloaded["a"] = reloaded["a"].astype("int64")

        assert_geodataframe_equal(geodataframe, reloaded, check_column_type="equiv")
@@ -0,0 +1,537 @@
import contextlib
import json
import os
import pathlib
from packaging.version import Version
import numpy as np
import shapely
from shapely import MultiPoint, Point, box
from geopandas import GeoDataFrame, GeoSeries
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
pytest.importorskip("pyarrow")
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow import feather
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
def pa_table(table):
    """Convert ``table`` to a ``pyarrow.Table``.

    With pyarrow older than 14.0.0 the private ``_pa_table`` attribute is
    returned; otherwise ``pa.table(table)`` is used.
    """
    legacy_pyarrow = Version(pa.__version__) < Version("14.0.0")
    return table._pa_table if legacy_pyarrow else pa.table(table)
def pa_array(array):
    """Convert ``array`` to a ``pyarrow.Array``.

    With pyarrow older than 14.0.0 the private ``_pa_array`` attribute is
    returned; otherwise ``pa.array(array)`` is used.
    """
    legacy_pyarrow = Version(pa.__version__) < Version("14.0.0")
    return array._pa_array if legacy_pyarrow else pa.array(array)
def assert_table_equal(left, right, check_metadata=True):
    """Assert that two Arrow tables are equal, with a descriptive failure.

    NaN coordinates in a fixed-size-list or struct "geometry" column are
    first checked to sit at the same positions in both tables and are then
    replaced by 0.0, so that the table comparison is not defeated by NaNs.

    Parameters
    ----------
    left, right : tables exposing a "geometry" column
    check_metadata : bool, default True
        Passed to ``Table.equals``; when true, schema metadata is also
        inspected to produce a more specific error.

    Raises
    ------
    AssertionError
        If the tables differ; the message names the schema, the metadata or
        the first differing column where that can be determined.
    """
    geom_type = left["geometry"].type
    # in case of Points (directly the inner fixed_size_list or struct type)
    # -> there are NaNs for empties -> we need to compare them separately
    # and then fill, because pyarrow.Table.equals considers NaNs as not equal
    if pa.types.is_fixed_size_list(geom_type):
        # Only chunk 0 of the geometry column is inspected.
        left_values = left["geometry"].chunk(0).values
        right_values = right["geometry"].chunk(0).values
        assert pc.is_nan(left_values).equals(pc.is_nan(right_values))
        left_geoms = pa.FixedSizeListArray.from_arrays(
            pc.replace_with_mask(left_values, pc.is_nan(left_values), 0.0),
            type=left["geometry"].type,
        )
        right_geoms = pa.FixedSizeListArray.from_arrays(
            pc.replace_with_mask(right_values, pc.is_nan(right_values), 0.0),
            type=right["geometry"].type,
        )
        # NOTE(review): column index 1 is hard-coded; presumably the geometry
        # column is always the second column of the tables passed in - confirm.
        left = left.set_column(1, left.schema.field("geometry"), left_geoms)
        right = right.set_column(1, right.schema.field("geometry"), right_geoms)

    elif pa.types.is_struct(geom_type):
        left_arr = left["geometry"].chunk(0)
        right_arr = right["geometry"].chunk(0)

        # NaN positions must agree field by field before they are filled.
        for i in range(left_arr.type.num_fields):
            assert pc.is_nan(left_arr.field(i)).equals(pc.is_nan(right_arr.field(i)))

        left_geoms = pa.StructArray.from_arrays(
            [
                pc.replace_with_mask(
                    left_arr.field(i), pc.is_nan(left_arr.field(i)), 0.0
                )
                for i in range(left_arr.type.num_fields)
            ],
            fields=list(left["geometry"].type),
        )
        right_geoms = pa.StructArray.from_arrays(
            [
                pc.replace_with_mask(
                    right_arr.field(i), pc.is_nan(right_arr.field(i)), 0.0
                )
                for i in range(right_arr.type.num_fields)
            ],
            fields=list(right["geometry"].type),
        )

        # NOTE(review): same hard-coded column index 1 as above - confirm.
        left = left.set_column(1, left.schema.field("geometry"), left_geoms)
        right = right.set_column(1, right.schema.field("geometry"), right_geoms)

    # Fast path: nothing more to do when the tables compare equal.
    if left.equals(right, check_metadata=check_metadata):
        return

    # Everything below only serves to raise the most specific error possible.
    if not left.schema.equals(right.schema):
        raise AssertionError(
            "Schema not equal\nLeft:\n{0}\nRight:\n{1}".format(
                left.schema, right.schema
            )
        )

    if check_metadata:
        if not left.schema.equals(right.schema, check_metadata=True):
            if not left.schema.metadata == right.schema.metadata:
                raise AssertionError(
                    "Metadata not equal\nLeft:\n{0}\nRight:\n{1}".format(
                        left.schema.metadata, right.schema.metadata
                    )
                )
            # Schema-level metadata matched, so look at each field's metadata.
            for col in left.schema.names:
                assert left.schema.field(col).equals(
                    right.schema.field(col), check_metadata=True
                )

    # Compare column by column to report the first differing one.
    for col in left.column_names:
        a_left = pa.concat_arrays(left.column(col).chunks)
        a_right = pa.concat_arrays(right.column(col).chunks)
        if not a_left.equals(a_right):
            raise AssertionError(
                "Column '{0}' not equal:\n{1}".format(col, a_left.diff(a_right))
            )

    raise AssertionError("Tables not equal for unknown reason")
@pytest.mark.skipif(
    shapely.geos_version < (3, 9, 0),
    reason="Checking for empty is buggy with GEOS<3.9",
)  # an old GEOS is installed in the CI builds with the defaults channel
@pytest.mark.parametrize(
    "dim",
    [
        "xy",
        pytest.param(
            "xyz",
            marks=pytest.mark.skipif(
                shapely.geos_version < (3, 10, 0),
                reason="Cannot write 3D geometries with GEOS<3.10",
            ),
        ),
    ],
)
@pytest.mark.parametrize(
    "geometry_type",
    ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
@pytest.mark.parametrize(
    "geometry_encoding, interleaved",
    [("WKB", None), ("geoarrow", True), ("geoarrow", False)],
    ids=["WKB", "geoarrow-interleaved", "geoarrow-separated"],
)
def test_geoarrow_export(geometry_type, dim, geometry_encoding, interleaved):
    """Compare ``to_arrow`` output with the stored example file for the encoding."""
    geoarrow_dir = DATA_PATH / "geoarrow"
    suffix = geometry_type + ("_z" if dim == "xyz" else "")
    is_wkb = geometry_encoding == "WKB"
    export_kwargs = {
        "geometry_encoding": geometry_encoding,
        "interleaved": interleaved,
    }

    # Build the input frame from the WKB example file
    frame = feather.read_feather(geoarrow_dir / f"example-{suffix}-wkb.arrow")
    frame["geometry"] = GeoSeries.from_wkb(frame["geometry"])
    frame["row_number"] = frame["row_number"].astype("int32")
    frame = GeoDataFrame(frame)
    frame.geometry.array.crs = None

    # Load the table we expect the export to reproduce
    if is_wkb:
        filename = f"example-{suffix}-wkb.arrow"
    else:
        filename = f"example-{suffix}{'-interleaved' if interleaved else ''}.arrow"
    expected = feather.read_table(geoarrow_dir / filename)

    # GeoDataFrame -> Arrow Table, dropping the "pandas" metadata
    result = pa_table(frame.to_arrow(**export_kwargs)).replace_schema_metadata(None)

    mask_nonempty = None
    if is_wkb and dim == "xyz" and geometry_type.startswith("multi"):
        # for collections with z dimension, drop the empties because those don't
        # roundtrip correctly to WKB
        # (https://github.com/libgeos/geos/issues/888)
        mask_nonempty = pa.array(np.asarray(~frame.geometry.is_empty))
        result = result.filter(mask_nonempty)
        expected = expected.filter(mask_nonempty)

    assert_table_equal(result, expected)

    # GeoSeries -> Arrow array
    if not is_wkb and geometry_type == "point":
        # for points, we again have to handle NaNs separately, we already did that
        # for table so let's just skip this part
        return

    result_arr = pa_array(frame.geometry.to_arrow(**export_kwargs))
    if mask_nonempty is not None:
        result_arr = result_arr.filter(mask_nonempty)
    assert result_arr.equals(expected["geometry"].chunk(0))
@pytest.mark.skipif(
    Version(shapely.__version__) < Version("2.0.2"),
    reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_multiple_geometry_crs(encoding):
    """Each geometry column carries its own CRS in the exported field metadata."""
    pytest.importorskip("pyproj")
    # ensure each geometry column has its own crs
    gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
    gdf["geom2"] = gdf.geometry.to_crs("epsg:3857")

    result = pa_table(gdf.to_arrow(geometry_encoding=encoding))

    for column, epsg_code in (("geometry", 4326), ("geom2", 3857)):
        extension_meta = json.loads(
            result.schema.field(column).metadata[b"ARROW:extension:metadata"]
        )
        assert json.loads(extension_meta["crs"])["id"]["code"] == epsg_code

    roundtripped = GeoDataFrame.from_arrow(result)
    assert_geodataframe_equal(gdf, roundtripped)
    assert gdf.geometry.crs == "epsg:4326"
    assert gdf.geom2.crs == "epsg:3857"
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_series_name_crs(encoding):
    """The exported Arrow field keeps the series name, extension name and CRS."""
    pytest.importorskip("pyproj")
    pytest.importorskip("pyarrow", minversion="14.0.0")

    gser = GeoSeries([box(0, 0, 10, 10)], crs="epsg:4326", name="geom")
    schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
    field = pa.Field._import_from_c_capsule(schema_capsule)
    assert field.name == "geom"

    # Compute the expected name first. Written inline, ``a == b if c else d``
    # parses as ``(a == b) if c else d``, so the "geoarrow" case would only
    # assert a non-empty bytes literal and check nothing.
    expected_extension_name = (
        b"geoarrow.wkb" if encoding == "WKB" else b"geoarrow.polygon"
    )
    assert field.metadata[b"ARROW:extension:name"] == expected_extension_name

    meta = json.loads(field.metadata[b"ARROW:extension:metadata"])
    assert json.loads(meta["crs"])["id"]["code"] == 4326

    # ensure it also works without a name
    gser = GeoSeries([box(0, 0, 10, 10)])
    schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
    field = pa.Field._import_from_c_capsule(schema_capsule)
    assert field.name == ""
def test_geoarrow_unsupported_encoding():
    """An unknown geometry encoding is rejected for both frames and series."""
    gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")

    for exporter in (gdf, gdf.geometry):
        with pytest.raises(ValueError, match="Expected geometry encoding"):
            exporter.to_arrow(geometry_encoding="invalid")
def test_geoarrow_mixed_geometry_types():
    """Point + polygon fails to export; point + multipoint exports as multipoint."""
    point_and_polygon = GeoDataFrame(
        {"geometry": [Point(0, 0), box(0, 0, 10, 10)]},
        crs="epsg:4326",
    )
    with pytest.raises(ValueError, match="Geometry type combination is not supported"):
        point_and_polygon.to_arrow(geometry_encoding="geoarrow")

    point_and_multipoint = GeoDataFrame(
        {"geometry": [Point(0, 0), MultiPoint([(0, 0), (1, 1)])]},
        crs="epsg:4326",
    )
    exported = pa_table(point_and_multipoint.to_arrow(geometry_encoding="geoarrow"))
    extension_name = exported.schema.field("geometry").metadata[
        b"ARROW:extension:name"
    ]
    assert extension_name == b"geoarrow.multipoint"
@pytest.mark.parametrize("geom_type", ["point", "polygon"])
@pytest.mark.parametrize(
    "encoding, interleaved", [("WKB", True), ("geoarrow", True), ("geoarrow", False)]
)
def test_geoarrow_missing(encoding, interleaved, geom_type):
    """A missing geometry is exported as a null entry in the Arrow column."""
    # dummy test for single geometry type until missing values are included
    # in the test data for test_geoarrow_export
    present = Point(0, 0) if geom_type == "point" else box(0, 0, 10, 10)
    gdf = GeoDataFrame(geometry=[present, None], crs="epsg:4326")

    expects_error = (
        encoding == "geoarrow"
        and geom_type == "point"
        and interleaved
        and Version(pa.__version__) < Version("15.0.0")
    )
    if expects_error:
        with pytest.raises(
            ValueError,
            match="Converting point geometries with missing values is not supported",
        ):
            gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved)
        return

    exported = pa_table(
        gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved)
    )
    geometry_column = exported["geometry"]
    assert geometry_column.null_count == 1
    assert geometry_column.is_null().to_pylist() == [False, True]
def test_geoarrow_include_z():
    """``include_z`` overrides the dimensionality inferred from the geometries."""

    def export(frame, **options):
        return pa_table(frame.to_arrow(geometry_encoding="geoarrow", **options))

    def check_layout(table, field_name, list_size):
        assert table["geometry"].type.value_field.name == field_name
        assert table["geometry"].type.list_size == list_size

    # 2D input: xy by default, xyz (with NaN z values) when forced
    flat = GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1), Point()]})
    check_layout(export(flat), "xy", 2)

    forced_3d = export(flat, include_z=True)
    check_layout(forced_3d, "xyz", 3)
    assert np.isnan(forced_3d["geometry"].chunk(0).values.to_numpy()[2::3]).all()

    # 3D input: xyz by default, xy when z is dropped
    spatial = GeoDataFrame({"geometry": [Point(0, 0, 0), Point(1, 1, 1), Point()]})
    check_layout(export(spatial), "xyz", 3)
    check_layout(export(spatial, include_z=False), "xy", 2)
@contextlib.contextmanager
def with_geoarrow_extension_types():
    """Register the geoarrow-pyarrow extension types for the ``with`` body.

    The calling test is skipped when ``geoarrow.pyarrow`` cannot be imported.
    The types are unregistered again on exit, even if the body raises.
    """
    geoarrow_pa = pytest.importorskip("geoarrow.pyarrow")
    geoarrow_pa.register_extension_types()
    try:
        yield
    finally:
        geoarrow_pa.unregister_extension_types()
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
    "geometry_type",
    ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
def test_geoarrow_export_with_extension_types(geometry_type, dim):
    """Exported geometry columns become extension types once these are registered."""
    # ensure the exported data can be imported by geoarrow-pyarrow and are
    # recognized as extension types
    geoarrow_dir = DATA_PATH / "geoarrow"
    suffix = geometry_type + ("_z" if dim == "xyz" else "")

    # Read the example data
    frame = feather.read_feather(geoarrow_dir / f"example-{suffix}-wkb.arrow")
    frame["geometry"] = GeoSeries.from_wkb(frame["geometry"])
    frame["row_number"] = frame["row_number"].astype("int32")
    frame = GeoDataFrame(frame)
    frame.geometry.array.crs = None

    pytest.importorskip("geoarrow.pyarrow")

    export_variants = (
        {"geometry_encoding": "WKB"},
        {"geometry_encoding": "geoarrow"},
        {"geometry_encoding": "geoarrow", "interleaved": False},
    )
    with with_geoarrow_extension_types():
        for options in export_variants:
            exported = pa_table(frame.to_arrow(**options))
            assert isinstance(exported["geometry"].type, pa.ExtensionType)
@pytest.mark.skipif(
    Version(shapely.__version__) < Version("2.0.2"),
    reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
    "geometry_type",
    [
        "point",
        "linestring",
        "polygon",
        "multipoint",
        "multilinestring",
        "multipolygon",
    ],
)
def test_geoarrow_import(geometry_type, dim):
    """``from_arrow`` gives the same frame for every stored encoding variant."""
    geoarrow_dir = DATA_PATH / "geoarrow"
    suffix = geometry_type + ("_z" if dim == "xyz" else "")

    # Read the example data
    expected = feather.read_feather(geoarrow_dir / f"example-{suffix}-wkb.arrow")
    expected["geometry"] = GeoSeries.from_wkb(expected["geometry"])
    expected = GeoDataFrame(expected)
    expected.geometry.crs = None

    # WKB, interleaved geoarrow and separated geoarrow, in that order
    for filename in (
        f"example-{suffix}-wkb.arrow",
        f"example-{suffix}-interleaved.arrow",
        f"example-{suffix}.arrow",
    ):
        table = feather.read_table(geoarrow_dir / filename)
        imported = GeoDataFrame.from_arrow(table)
        assert_geodataframe_equal(imported, expected)
@pytest.mark.skipif(
    Version(shapely.__version__) < Version("2.0.2"),
    reason="from_ragged_array failing with read-only array input",
)
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
def test_geoarrow_import_geometry_column(encoding):
    """``from_arrow`` selects the active geometry column as expected."""
    pytest.importorskip("pyproj")
    # ensure each geometry column has its own crs
    gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)])
    gdf["centroid"] = gdf.geometry.centroid

    # Both geometry columns present, no explicit choice
    full_table = pa_table(gdf.to_arrow(geometry_encoding=encoding))
    result = GeoDataFrame.from_arrow(full_table)
    assert_geodataframe_equal(result, gdf)
    assert result.active_geometry_name == "geometry"

    # Only the "centroid" column present
    centroid_table = pa_table(gdf[["centroid"]].to_arrow(geometry_encoding=encoding))
    result = GeoDataFrame.from_arrow(centroid_table)
    assert result.active_geometry_name == "centroid"

    # Both columns present, "centroid" requested explicitly
    full_table = pa_table(gdf.to_arrow(geometry_encoding=encoding))
    result = GeoDataFrame.from_arrow(full_table, geometry="centroid")
    assert result.active_geometry_name == "centroid"
    assert_geodataframe_equal(result, gdf.set_geometry("centroid"))
def test_geoarrow_import_missing_geometry():
    """Importing Arrow data without any geometry raises ``ValueError``."""
    pytest.importorskip("pyarrow", minversion="14.0.0")

    plain_table = pa.table({"a": [0, 1, 2], "b": [0.1, 0.2, 0.3]})

    with pytest.raises(ValueError, match="No geometry column found"):
        GeoDataFrame.from_arrow(plain_table)

    with pytest.raises(ValueError, match="No GeoArrow geometry field found"):
        GeoSeries.from_arrow(plain_table["a"].chunk(0))
def test_geoarrow_import_capsule_interface():
    """``from_arrow`` accepts the object returned by ``to_arrow`` directly."""
    # ensure we can import non-pyarrow object
    pytest.importorskip("pyarrow", minversion="14.0.0")

    gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
    exported = gdf.to_arrow()
    roundtripped = GeoDataFrame.from_arrow(exported)
    assert_geodataframe_equal(roundtripped, gdf)
@pytest.mark.parametrize("dim", ["xy", "xyz"])
@pytest.mark.parametrize(
    "geometry_type",
    ["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
)
def test_geoarrow_import_from_extension_types(geometry_type, dim):
    """Frames round-trip through Arrow while extension types are registered."""
    # ensure the exported data can be imported by geoarrow-pyarrow and are
    # recognized as extension types
    pytest.importorskip("pyproj")
    geoarrow_dir = DATA_PATH / "geoarrow"
    suffix = geometry_type + ("_z" if dim == "xyz" else "")

    # Read the example data
    frame = feather.read_feather(geoarrow_dir / f"example-{suffix}-wkb.arrow")
    frame["geometry"] = GeoSeries.from_wkb(frame["geometry"])
    frame = GeoDataFrame(frame, crs="EPSG:3857")

    pytest.importorskip("geoarrow.pyarrow")

    export_variants = (
        {"geometry_encoding": "WKB"},
        {"geometry_encoding": "geoarrow"},
        {"geometry_encoding": "geoarrow", "interleaved": False},
    )
    with with_geoarrow_extension_types():
        for options in export_variants:
            roundtripped = GeoDataFrame.from_arrow(pa_table(frame.to_arrow(**options)))
            assert_geodataframe_equal(roundtripped, frame)
def test_geoarrow_import_geoseries():
    """``GeoSeries.from_arrow`` accepts geoarrow-pyarrow extension arrays."""
    pytest.importorskip("pyproj")
    geoarrow_pa = pytest.importorskip("geoarrow.pyarrow")
    ser = GeoSeries.from_wkt(["POINT (1 1)", "POINT (2 2)"], crs="EPSG:3857")

    with with_geoarrow_extension_types():
        for geometry_encoding in ("WKB", "geoarrow"):
            arr = geoarrow_pa.array(ser.to_arrow(geometry_encoding=geometry_encoding))
            assert_geoseries_equal(GeoSeries.from_arrow(arr), ser)

        # the name is lost when going through a pyarrow.Array
        ser.name = "name"
        arr = geoarrow_pa.array(ser.to_arrow())
        unnamed = GeoSeries.from_arrow(arr)
        assert unnamed.name is None

        # we can specify the name as one of the kwargs
        named = GeoSeries.from_arrow(arr, name="test")
        assert_geoseries_equal(named, ser)
def test_geoarrow_import_unknown_geoarrow_type():
    """A geometry field tagged with an unknown extension name is rejected."""
    gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
    table = pa_table(gdf.to_arrow())
    original_schema = table.schema

    # Re-tag the geometry field with an extension name that does not exist.
    unknown_metadata = {
        b"ARROW:extension:name": b"geoarrow.unknown",
        b"ARROW:extension:metadata": b"{}",
    }
    retagged_field = original_schema.field("geometry").with_metadata(unknown_metadata)
    retagged_table = table.cast(pa.schema([original_schema.field(0), retagged_field]))

    with pytest.raises(TypeError, match="Unknown GeoArrow extension type"):
        GeoDataFrame.from_arrow(retagged_table)
@@ -0,0 +1,306 @@
from collections import OrderedDict
import numpy as np
import pandas as pd
from shapely.geometry import (
LineString,
MultiLineString,
MultiPoint,
MultiPolygon,
Point,
Polygon,
)
from geopandas import GeoDataFrame
from geopandas.io.file import infer_schema
import pytest
# Credit: Polygons below come from Montreal city Open Data portal
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
# Shared geometry fixtures used by the schema-inference tests below.

# Polygon with a closed exterior ring (the first vertex is repeated last).
city_hall_boundaries = Polygon(
    (
        (-73.5541107525234, 45.5091983609661),
        (-73.5546126200639, 45.5086813829106),
        (-73.5540185061397, 45.5084409343852),
        (-73.5539986525799, 45.5084323044531),
        (-73.5535801792994, 45.5089539203786),
        (-73.5541107525234, 45.5091983609661),
    )
)
# Second polygon; two of its vertices also appear in city_hall_boundaries.
vauquelin_place = Polygon(
    (
        (-73.5542465586147, 45.5081555487952),
        (-73.5540185061397, 45.5084409343852),
        (-73.5546126200639, 45.5086813829106),
        (-73.5548825850032, 45.5084033554357),
        (-73.5542465586147, 45.5081555487952),
    )
)
# Two LineStrings built from vertices of the city_hall_boundaries ring.
city_hall_walls = [
    LineString(
        (
            (-73.5541107525234, 45.5091983609661),
            (-73.5546126200639, 45.5086813829106),
            (-73.5540185061397, 45.5084409343852),
        )
    ),
    LineString(
        (
            (-73.5539986525799, 45.5084323044531),
            (-73.5535801792994, 45.5089539203786),
            (-73.5541107525234, 45.5091983609661),
        )
    ),
]
# 2D points.
city_hall_entrance = Point(-73.553785, 45.508722)
city_hall_balcony = Point(-73.554138, 45.509080)
city_hall_council_chamber = Point(-73.554246, 45.508931)
# 3D variants: every coordinate carries a z value of 300.
point_3D = Point(-73.553785, 45.508722, 300)
linestring_3D = LineString(
    (
        (-73.5541107525234, 45.5091983609661, 300),
        (-73.5546126200639, 45.5086813829106, 300),
        (-73.5540185061397, 45.5084409343852, 300),
    )
)
# NOTE(review): this ring has only two distinct vertices (the first is
# repeated last); the tests below only use its type and dimensionality.
polygon_3D = Polygon(
    (
        (-73.5541107525234, 45.5091983609661, 300),
        (-73.5535801792994, 45.5089539203786, 300),
        (-73.5541107525234, 45.5091983609661, 300),
    )
)
def test_infer_schema_only_points():
    """A frame holding only Points infers the "Point" geometry type."""
    frame = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
    expected = {"geometry": "Point", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_points_and_multipoints():
    """Mixed MultiPoint and Point geometries are listed as two types."""
    geometries = [
        MultiPoint([city_hall_entrance, city_hall_balcony]),
        city_hall_balcony,
    ]
    frame = GeoDataFrame(geometry=geometries)
    expected = {
        "geometry": ["MultiPoint", "Point"],
        "properties": OrderedDict(),
    }
    assert infer_schema(frame) == expected
def test_infer_schema_only_multipoints():
    """A frame holding only MultiPoints infers the "MultiPoint" type."""
    multipoint = MultiPoint(
        [city_hall_entrance, city_hall_balcony, city_hall_council_chamber]
    )
    frame = GeoDataFrame(geometry=[multipoint])
    expected = {"geometry": "MultiPoint", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_only_linestrings():
    """A frame holding only LineStrings infers the "LineString" type."""
    frame = GeoDataFrame(geometry=city_hall_walls)
    expected = {"geometry": "LineString", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_linestrings_and_multilinestrings():
    """Mixed MultiLineString and LineString geometries are listed as two types."""
    geometries = [MultiLineString(city_hall_walls), city_hall_walls[0]]
    frame = GeoDataFrame(geometry=geometries)
    expected = {
        "geometry": ["MultiLineString", "LineString"],
        "properties": OrderedDict(),
    }
    assert infer_schema(frame) == expected
def test_infer_schema_only_multilinestrings():
    """A frame holding only MultiLineStrings infers the "MultiLineString" type."""
    frame = GeoDataFrame(geometry=[MultiLineString(city_hall_walls)])
    expected = {
        "geometry": "MultiLineString",
        "properties": OrderedDict(),
    }
    assert infer_schema(frame) == expected
def test_infer_schema_only_polygons():
    """A frame holding only Polygons infers the "Polygon" type."""
    frame = GeoDataFrame(geometry=[city_hall_boundaries, vauquelin_place])
    expected = {"geometry": "Polygon", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_polygons_and_multipolygons():
    """Mixed MultiPolygon and Polygon geometries are listed as two types."""
    geometries = [
        MultiPolygon((city_hall_boundaries, vauquelin_place)),
        city_hall_boundaries,
    ]
    frame = GeoDataFrame(geometry=geometries)
    expected = {
        "geometry": ["MultiPolygon", "Polygon"],
        "properties": OrderedDict(),
    }
    assert infer_schema(frame) == expected
def test_infer_schema_only_multipolygons():
    """A frame holding only MultiPolygons infers the "MultiPolygon" type."""
    multipolygon = MultiPolygon((city_hall_boundaries, vauquelin_place))
    frame = GeoDataFrame(geometry=[multipolygon])
    expected = {"geometry": "MultiPolygon", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_multiple_shape_types():
    """All six 2D geometry types are reported when they are all present."""
    geometries = [
        MultiPolygon((city_hall_boundaries, vauquelin_place)),
        city_hall_boundaries,
        MultiLineString(city_hall_walls),
        city_hall_walls[0],
        MultiPoint([city_hall_entrance, city_hall_balcony]),
        city_hall_balcony,
    ]
    frame = GeoDataFrame(geometry=geometries)
    expected_types = [
        "MultiPolygon",
        "Polygon",
        "MultiLineString",
        "LineString",
        "MultiPoint",
        "Point",
    ]
    expected = {"geometry": expected_types, "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_mixed_3D_shape_type():
    """A 3D Point mixed with 2D types is reported as "3D Point", listed first."""
    geometries = [
        MultiPolygon((city_hall_boundaries, vauquelin_place)),
        city_hall_boundaries,
        MultiLineString(city_hall_walls),
        city_hall_walls[0],
        MultiPoint([city_hall_entrance, city_hall_balcony]),
        city_hall_balcony,
        point_3D,
    ]
    frame = GeoDataFrame(geometry=geometries)
    expected_types = [
        "3D Point",
        "MultiPolygon",
        "Polygon",
        "MultiLineString",
        "LineString",
        "MultiPoint",
        "Point",
    ]
    expected = {"geometry": expected_types, "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_mixed_3D_Point():
    """A 2D and a 3D Point together yield both "3D Point" and "Point"."""
    frame = GeoDataFrame(geometry=[city_hall_balcony, point_3D])
    expected = {
        "geometry": ["3D Point", "Point"],
        "properties": OrderedDict(),
    }
    assert infer_schema(frame) == expected
def test_infer_schema_only_3D_Points():
    """A frame holding only 3D Points infers the "3D Point" type."""
    frame = GeoDataFrame(geometry=[point_3D, point_3D])
    expected = {"geometry": "3D Point", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_mixed_3D_linestring():
    """A 2D and a 3D LineString yield both "3D LineString" and "LineString"."""
    frame = GeoDataFrame(geometry=[city_hall_walls[0], linestring_3D])
    expected = {
        "geometry": ["3D LineString", "LineString"],
        "properties": OrderedDict(),
    }
    assert infer_schema(frame) == expected
def test_infer_schema_only_3D_linestrings():
    """A frame holding only 3D LineStrings infers the "3D LineString" type."""
    frame = GeoDataFrame(geometry=[linestring_3D, linestring_3D])
    expected = {
        "geometry": "3D LineString",
        "properties": OrderedDict(),
    }
    assert infer_schema(frame) == expected
def test_infer_schema_mixed_3D_Polygon():
    """A 2D and a 3D Polygon together yield both "3D Polygon" and "Polygon"."""
    frame = GeoDataFrame(geometry=[city_hall_boundaries, polygon_3D])
    expected = {
        "geometry": ["3D Polygon", "Polygon"],
        "properties": OrderedDict(),
    }
    assert infer_schema(frame) == expected
def test_infer_schema_only_3D_Polygons():
    """A frame holding only 3D Polygons infers the "3D Polygon" type."""
    frame = GeoDataFrame(geometry=[polygon_3D, polygon_3D])
    expected = {"geometry": "3D Polygon", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_null_geometry_and_2D_point():
    """A null geometry next to a 2D Point does not affect the inferred type."""
    frame = GeoDataFrame(geometry=[None, city_hall_entrance])
    # None geometry type is then omitted
    expected = {"geometry": "Point", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_null_geometry_and_3D_point():
    """A null geometry next to a 3D Point does not affect the inferred type."""
    frame = GeoDataFrame(geometry=[None, point_3D])
    # None geometry type is then omitted
    expected = {"geometry": "3D Point", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
def test_infer_schema_null_geometry_all():
    """A frame with only null geometries infers the "Unknown" type."""
    frame = GeoDataFrame(geometry=[None, None])
    # None geometry type is then replaced by 'Unknown'
    # (default geometry type supported by Fiona)
    expected = {"geometry": "Unknown", "properties": OrderedDict()}
    assert infer_schema(frame) == expected
@pytest.mark.parametrize(
    "array_data,dtype", [([1, 2**31 - 1], np.int32), ([1, np.nan], pd.Int32Dtype())]
)
def test_infer_schema_int32(array_data, dtype):
    """An int32 column (NumPy or nullable pandas dtype) is reported as "int32"."""
    frame = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
    frame["int32_column"] = pd.array(data=array_data, dtype=dtype)

    expected = {
        "geometry": "Point",
        "properties": OrderedDict([("int32_column", "int32")]),
    }
    assert infer_schema(frame) == expected
def test_infer_schema_int64():
    """A nullable Int64 column is reported as "int"."""
    frame = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
    frame["int64_column"] = pd.array([1, np.nan], dtype=pd.Int64Dtype())

    expected = {
        "geometry": "Point",
        "properties": OrderedDict([("int64_column", "int")]),
    }
    assert infer_schema(frame) == expected
@@ -0,0 +1,56 @@
"""
See generate_legacy_storage_files.py for the creation of the legacy files.
"""
import glob
import os
import pathlib
import pandas as pd
import pytest
from geopandas.testing import assert_geodataframe_equal
# "data" directory located next to this test module.
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
@pytest.fixture(scope="module")
def current_pickle_data():
    """Build the pickle test data with the currently installed version.

    The generator module is imported lazily, inside the fixture, and its
    ``create_pickle_data`` result is shared by every test in this module.
    """
    from . import generate_legacy_storage_files as generator

    return generator.create_pickle_data()
# Every stored pickle file under DATA_PATH / "pickle"; each one becomes a
# parameter of the ``legacy_pickle`` fixture below.
files = glob.glob(str(DATA_PATH / "pickle" / "*.pickle"))


# Use os.path.basename for the test ids: glob returns paths with the platform's
# separator, so splitting on "/" would leave the full path as the id on Windows.
@pytest.fixture(params=files, ids=[os.path.basename(p) for p in files])
def legacy_pickle(request):
    """Return the path of one stored pickle file (one parameter per file)."""
    return request.param
@pytest.mark.skip(
    reason=(
        "shapely 2.0/pygeos-based unpickling currently only works for "
        "shapely-2.0/pygeos-written files"
    ),
)
def test_legacy_pickles(current_pickle_data, legacy_pickle):
    """Compare every object in a stored pickle with its freshly built twin."""
    stored = pd.read_pickle(legacy_pickle)

    for key, loaded in stored.items():
        assert_geodataframe_equal(loaded, current_pickle_data[key])
def test_round_trip_current(tmpdir, current_pickle_data):
    """Pickle each current object to ``tmpdir`` and check it reads back equal."""
    for name, frame in current_pickle_data.items():
        target = str(tmpdir / "{}.pickle".format(name))
        frame.to_pickle(target)

        restored = pd.read_pickle(target)

        assert_geodataframe_equal(restored, frame)
        # has_sindex must still be usable (and a plain bool) after unpickling.
        assert isinstance(restored.has_sindex, bool)
@@ -0,0 +1,878 @@
"""
Tests here include reading/writing to different types of spatial databases.
The spatial database tests may not work without additional system
configuration. PostGIS tests require a test database to have been set up;
see geopandas.tests.util for more information.
"""
import os
import warnings
from importlib.util import find_spec
import pandas as pd
import geopandas
import geopandas._compat as compat
from geopandas import GeoDataFrame, read_file, read_postgis
from geopandas._compat import HAS_PYPROJ
from geopandas.io.sql import _get_conn as get_conn
from geopandas.io.sql import _write_postgis as write_postgis
import pytest
from geopandas.tests.util import (
create_postgis,
create_spatialite,
mock,
validate_boro_df,
)
try:
from sqlalchemy import text
except ImportError:
# Avoid local imports for text in all sqlalchemy tests
# all tests using text use engine_postgis, which ensures sqlalchemy is available
text = str
@pytest.fixture
def df_nybb(nybb_filename):
    """Read the file named by the ``nybb_filename`` fixture with ``read_file``."""
    return read_file(nybb_filename)
def check_available_postgis_drivers() -> list[str]:
    """Return which of ``psycopg`` and ``psycopg2`` can be imported.

    Tests are parametrized over this list, so a test for a driver that is not
    installed is never generated (rather than being skipped, as skips are
    treated as failures during postgis CI).
    """
    candidates = ("psycopg", "psycopg2")
    return [name for name in candidates if find_spec(name)]


POSTGIS_DRIVERS = check_available_postgis_drivers()
def prepare_database_credentials() -> dict:
    """Collect the postgres connection settings.

    The database name is fixed to ``test_geopandas``; user, password, host and
    port are read from ``PGUSER``, ``PGPASSWORD``, ``PGHOST`` and ``PGPORT``
    and are ``None`` when the corresponding variable is not set.
    """
    env_names = (
        ("user", "PGUSER"),
        ("password", "PGPASSWORD"),
        ("host", "PGHOST"),
        ("port", "PGPORT"),
    )
    credentials = {"dbname": "test_geopandas"}
    for key, variable in env_names:
        credentials[key] = os.environ.get(variable)
    return credentials
@pytest.fixture()
def connection_postgis(request):
    """Yield a postgres connection opened with the driver named by ``request.param``.

    Use this as an indirect fixture, where the request parameter is
    POSTGIS_DRIVERS (psycopg2 or psycopg). The test is skipped when the driver
    cannot be imported or the connection attempt raises ``OperationalError``.
    """
    driver = pytest.importorskip(request.param)
    credentials = prepare_database_credentials()

    try:
        connection = driver.connect(**credentials)
    except driver.OperationalError:
        pytest.skip("Cannot connect with postgresql database")

    # The warning filter stays active for as long as the test uses the connection.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", message="pandas only supports SQLAlchemy connectable.*"
        )
        yield connection

    connection.close()
@pytest.fixture()
def engine_postgis(request):
    """
    Initiate a sqlalchemy connection engine using either psycopg2 or psycopg.

    Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS.
    The test is skipped when sqlalchemy is missing or when the engine cannot be
    created or connected; the engine is disposed once the test has finished.
    """
    sqlalchemy = pytest.importorskip("sqlalchemy")
    from sqlalchemy.engine.url import URL

    credentials = prepare_database_credentials()
    try:
        con = sqlalchemy.create_engine(
            URL.create(
                drivername=f"postgresql+{request.param}",
                username=credentials["user"],
                database=credentials["dbname"],
                password=credentials["password"],
                host=credentials["host"],
                port=credentials["port"],
            )
        )
        # Probe that the database is reachable, and close the probe connection
        # explicitly instead of leaving it for the garbage collector.
        with con.connect():
            pass
    except Exception:
        pytest.skip("Cannot connect with postgresql database")

    yield con
    con.dispose()
@pytest.fixture()
def connection_spatialite():
    """
    Return a memory-based SQLite3 connection with SpatiaLite enabled & initialized.

    `The sqlite3 module must be built with loadable extension support
    <https://docs.python.org/3/library/sqlite3.html#f1>`_ and
    `SpatiaLite <https://www.gaia-gis.it/fossil/libspatialite/index>`_
    must be available on the system as a SQLite module.
    Packages available on Anaconda meet requirements.

    Skips
    -----
    Any exception raised during setup skips the test, for example
    ``AttributeError`` on missing support for loadable SQLite extensions or
    ``sqlite3.OperationalError`` on missing SpatiaLite.
    """
    sqlite3 = pytest.importorskip("sqlite3")

    # Open the connection before the ``try`` block so that ``con`` is always
    # bound when the error handler below closes it.
    con = sqlite3.connect(":memory:")
    try:
        # ``with con`` only manages the transaction; it does not close ``con``.
        with con:
            con.enable_load_extension(True)
            con.load_extension("mod_spatialite")
            con.execute("SELECT InitSpatialMetaData(TRUE)")
    except Exception:
        con.close()
        pytest.skip("Cannot setup spatialite database")

    yield con
    con.close()
def drop_table_if_exists(conn_or_engine, table):
    """Drop the table named ``table`` through ``conn_or_engine`` if it exists.

    Does nothing when the inspector reports no such table, or when the
    reflected metadata does not contain it.
    """
    sqlalchemy = pytest.importorskip("sqlalchemy")

    if not sqlalchemy.inspect(conn_or_engine).has_table(table):
        return

    metadata = sqlalchemy.MetaData()
    with warnings.catch_warnings():
        # Silence the reflection warning about the 'geometry' column type.
        warnings.filterwarnings(
            "ignore", message="Did not recognize type 'geometry' of column.*"
        )
        metadata.reflect(conn_or_engine)

    reflected = metadata.tables.get(table)
    if reflected is None:
        return
    reflected.drop(conn_or_engine, checkfirst=True)
@pytest.fixture
def df_mixed_single_and_multi():
    """GeoDataFrame (EPSG:4326) with a LineString, a MultiLineString and a Point."""
    from shapely.geometry import LineString, MultiLineString, Point

    geometries = [
        LineString([(0, 0), (1, 1)]),
        MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
        Point(0, 1),
    ]
    return geopandas.GeoDataFrame({"geometry": geometries}, crs="epsg:4326")
@pytest.fixture
def df_geom_collection():
    """GeoDataFrame (EPSG:4326) holding a single GeometryCollection row."""
    from shapely.geometry import GeometryCollection, LineString, Point, Polygon

    members = [
        Polygon([(0, 0), (1, 1), (0, 1)]),
        LineString([(0, 0), (1, 1)]),
        Point(0, 0),
    ]
    collection = GeometryCollection(members)
    return geopandas.GeoDataFrame({"geometry": [collection]}, crs="epsg:4326")
@pytest.fixture
def df_linear_ring():
    """GeoDataFrame (EPSG:4326) holding a single LinearRing row."""
    from shapely.geometry import LinearRing

    ring = LinearRing(((0, 0), (0, 1), (1, 1), (1, 0)))
    return geopandas.GeoDataFrame({"geometry": [ring]}, crs="epsg:4326")
@pytest.fixture
def df_3D_geoms():
    """GeoDataFrame (EPSG:4326) with a LineString, Polygon and Point given in x/y/z."""
    from shapely.geometry import LineString, Point, Polygon

    geometries = [
        LineString([(0, 0, 0), (1, 1, 1)]),
        Polygon([(0, 0, 0), (1, 1, 1), (0, 1, 1)]),
        Point(0, 1, 2),
    ]
    return geopandas.GeoDataFrame({"geometry": geometries}, crs="epsg:4326")
class TestIO:
    """Tests for reading from and writing to spatial databases.

    Tests taking ``connection_postgis`` or ``engine_postgis`` are parametrized
    indirectly over ``POSTGIS_DRIVERS``; tests taking ``connection_spatialite``
    run against an in-memory SpatiaLite database.
    """

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_get_conn(self, engine_postgis):
        """``get_conn`` yields a Connection for engines and connections alike."""
        Connection = pytest.importorskip("sqlalchemy.engine.base").Connection

        engine = engine_postgis
        with get_conn(engine) as output:
            assert isinstance(output, Connection)
        with engine.connect() as conn:
            with get_conn(conn) as output:
                assert isinstance(output, Connection)
        with pytest.raises(ValueError):
            with get_conn(object()):
                pass

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_postgis_default(self, connection_postgis, df_nybb):
        """Tests reading with default arguments; no CRS should be set."""
        con = connection_postgis
        create_postgis(con, df_nybb)

        sql = "SELECT * FROM nybb;"
        df = read_postgis(sql, con)

        validate_boro_df(df)
        # no crs defined on the created geodatabase, and none specified
        # by user; should not be set to 0, as from get_srid failure
        assert df.crs is None

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_postgis_custom_geom_col(self, connection_postgis, df_nybb):
        """Tests reading when the geometry column has a non-default name."""
        con = connection_postgis
        geom_col = "the_geom"
        create_postgis(con, df_nybb, geom_col=geom_col)

        sql = "SELECT * FROM nybb;"
        df = read_postgis(sql, con, geom_col=geom_col)

        validate_boro_df(df)

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_postgis_select_geom_as(self, connection_postgis, df_nybb):
        """Tests that a SELECT {geom} AS {some_other_geom} works."""
        con = connection_postgis
        orig_geom = "geom"
        out_geom = "the_geom"
        create_postgis(con, df_nybb, geom_col=orig_geom)

        sql = """SELECT borocode, boroname, shape_leng, shape_area,
                    {} as {} FROM nybb;""".format(
            orig_geom, out_geom
        )
        df = read_postgis(sql, con, geom_col=out_geom)

        validate_boro_df(df)

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_postgis_get_srid(self, connection_postgis, df_nybb):
        """Tests that an SRID can be read from a geodatabase (GH #451)."""
        con = connection_postgis
        crs = "epsg:4269"
        df_reproj = df_nybb.to_crs(crs)
        create_postgis(con, df_reproj, srid=4269)

        sql = "SELECT * FROM nybb;"
        df = read_postgis(sql, con)

        validate_boro_df(df)
        assert df.crs == crs

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_postgis_override_srid(self, connection_postgis, df_nybb):
        """Tests that a user specified CRS overrides the geodatabase SRID."""
        con = connection_postgis
        orig_crs = df_nybb.crs
        create_postgis(con, df_nybb, srid=4269)

        sql = "SELECT * FROM nybb;"
        df = read_postgis(sql, con, crs=orig_crs)

        validate_boro_df(df)
        assert df.crs == orig_crs

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_from_postgis_default(self, connection_postgis, df_nybb):
        """Tests ``GeoDataFrame.from_postgis`` with default arguments."""
        con = connection_postgis
        create_postgis(con, df_nybb)

        sql = "SELECT * FROM nybb;"
        df = GeoDataFrame.from_postgis(sql, con)

        validate_boro_df(df, case_sensitive=False)

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_from_postgis_custom_geom_col(self, connection_postgis, df_nybb):
        """Tests ``GeoDataFrame.from_postgis`` with a custom geometry column."""
        con = connection_postgis
        geom_col = "the_geom"
        create_postgis(con, df_nybb, geom_col=geom_col)

        sql = "SELECT * FROM nybb;"
        df = GeoDataFrame.from_postgis(sql, con, geom_col=geom_col)

        validate_boro_df(df, case_sensitive=False)

    def test_read_postgis_null_geom(self, connection_spatialite, df_nybb):
        """Tests that geometry with NULL is accepted."""
        con = connection_spatialite
        geom_col = df_nybb.geometry.name
        df_nybb.geometry.iat[0] = None
        create_spatialite(con, df_nybb)
        sql = (
            "SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
            'AsEWKB("{0}") AS "{0}" FROM nybb'.format(geom_col)
        )
        df = read_postgis(sql, con, geom_col=geom_col)
        validate_boro_df(df)

    def test_read_postgis_binary(self, connection_spatialite, df_nybb):
        """Tests that geometry read as binary is accepted."""
        con = connection_spatialite
        geom_col = df_nybb.geometry.name
        create_spatialite(con, df_nybb)
        sql = (
            "SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
            'ST_AsBinary("{0}") AS "{0}" FROM nybb'.format(geom_col)
        )
        df = read_postgis(sql, con, geom_col=geom_col)
        validate_boro_df(df)

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_postgis_chunksize(self, connection_postgis, df_nybb):
        """Test chunksize argument"""
        chunksize = 2
        con = connection_postgis
        create_postgis(con, df_nybb)

        sql = "SELECT * FROM nybb;"
        df = pd.concat(read_postgis(sql, con, chunksize=chunksize))

        validate_boro_df(df)
        # no crs defined on the created geodatabase, and none specified
        # by user; should not be set to 0, as from get_srid failure
        assert df.crs is None

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_default(self, engine_postgis, df_nybb):
        """Tests that GeoDataFrame can be written to PostGIS with defaults."""
        engine = engine_postgis
        table = "nybb"

        # If table exists, delete it before trying to write with defaults
        drop_table_if_exists(engine, table)

        # Write to db
        write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
        # Validate
        sql = text("SELECT * FROM {table};".format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")
        validate_boro_df(df)

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_uppercase_tablename(self, engine_postgis, df_nybb):
        """Tests writing GeoDataFrame to PostGIS with uppercase tablename."""
        engine = engine_postgis
        table = "aTestTable"

        # If table exists, delete it before trying to write with defaults
        drop_table_if_exists(engine, table)

        # Write to db
        write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
        # Validate (the mixed-case table name has to be quoted in the query)
        sql = text('SELECT * FROM "{table}";'.format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")
        validate_boro_df(df)

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_sqlalchemy_connection(self, engine_postgis, df_nybb):
        """Tests that GeoDataFrame can be written through a SQLAlchemy connection."""
        with engine_postgis.begin() as con:
            table = "nybb_con"

            # If table exists, delete it before trying to write with defaults
            drop_table_if_exists(con, table)

            # Write to db
            write_postgis(df_nybb, con=con, name=table, if_exists="fail")
            # Validate
            sql = text("SELECT * FROM {table};".format(table=table))
            df = read_postgis(sql, con, geom_col="geometry")
            validate_boro_df(df)

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_fail_when_table_exists(self, engine_postgis, df_nybb):
        """
        Tests that uploading the same table raises error when: if_exists='fail'.
        """
        engine = engine_postgis
        table = "nybb"

        # Ensure table exists
        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")

        # pytest.raises makes the test fail when no error is raised at all,
        # which a try/except that only inspects the message would not do.
        with pytest.raises(ValueError, match="already exists"):
            write_postgis(df_nybb, con=engine, name=table, if_exists="fail")

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_replace_when_table_exists(self, engine_postgis, df_nybb):
        """
        Tests that replacing a table is possible when: if_exists='replace'.
        """
        engine = engine_postgis
        table = "nybb"

        # Ensure table exists
        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
        # Overwrite
        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
        # Validate
        sql = text("SELECT * FROM {table};".format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")
        validate_boro_df(df)

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_append_when_table_exists(self, engine_postgis, df_nybb):
        """
        Tests that appending to existing table produces correct results when:
        if_exists='append'.
        """
        engine = engine_postgis
        table = "nybb"

        orig_rows, orig_cols = df_nybb.shape
        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
        write_postgis(df_nybb, con=engine, name=table, if_exists="append")
        # Validate
        sql = text("SELECT * FROM {table};".format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")
        new_rows, new_cols = df.shape

        # There should be twice as many rows in the new table
        assert (
            new_rows == orig_rows * 2
        ), "There should be {target} rows,found: {current}".format(
            target=orig_rows * 2, current=new_rows
        )
        # Number of columns should stay the same
        assert (
            new_cols == orig_cols
        ), "There should be {target} columns,found: {current}".format(
            target=orig_cols, current=new_cols
        )

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_without_crs(self, engine_postgis, df_nybb):
        """
        Tests that GeoDataFrame can be written to PostGIS without CRS information.
        """
        engine = engine_postgis
        table = "nybb"

        # Write to db
        df_nybb.geometry.array.crs = None
        with pytest.warns(UserWarning, match="Could not parse CRS from the GeoDataF"):
            write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
        # Validate that srid is 0
        sql = text(
            "SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
                schema="public", table=table, geom_col="geometry"
            )
        )
        with engine.connect() as conn:
            target_srid = conn.execute(sql).fetchone()[0]
        assert target_srid == 0, "SRID should be 0, found %s" % target_srid

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_with_esri_authority(self, engine_postgis, df_nybb):
        """
        Tests that GeoDataFrame can be written to PostGIS with ESRI Authority
        CRS information (GH #2414).
        """
        engine = engine_postgis
        table = "nybb"

        # Write to db
        df_nybb_esri = df_nybb.to_crs("ESRI:102003")
        write_postgis(df_nybb_esri, con=engine, name=table, if_exists="replace")
        # Validate that srid is 102003
        sql = text(
            "SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
                schema="public", table=table, geom_col="geometry"
            )
        )
        with engine.connect() as conn:
            target_srid = conn.execute(sql).fetchone()[0]
        assert target_srid == 102003, "SRID should be 102003, found %s" % target_srid

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_geometry_collection(
        self, engine_postgis, df_geom_collection
    ):
        """
        Tests that writing a GeometryCollection is possible.
        """
        engine = engine_postgis
        table = "geomtype_tests"

        write_postgis(df_geom_collection, con=engine, name=table, if_exists="replace")

        # Validate geometry type
        sql = text(
            "SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
                table=table
            )
        )
        with engine.connect() as conn:
            geom_type = conn.execute(sql).fetchone()[0]
        sql = text("SELECT * FROM {table};".format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")

        assert geom_type.upper() == "GEOMETRYCOLLECTION"
        assert df.geom_type.unique()[0] == "GeometryCollection"

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_mixed_geometry_types(
        self, engine_postgis, df_mixed_single_and_multi
    ):
        """
        Tests that writing a mix of single and MultiGeometries is possible.
        """
        engine = engine_postgis
        table = "geomtype_tests"

        write_postgis(
            df_mixed_single_and_multi, con=engine, name=table, if_exists="replace"
        )

        # Validate geometry type
        sql = text(
            "SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
                table=table
            )
        )
        with engine.connect() as conn:
            res = conn.execute(sql).fetchall()
        assert res[0][0].upper() == "LINESTRING"
        assert res[1][0].upper() == "MULTILINESTRING"
        assert res[2][0].upper() == "POINT"

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_linear_ring(self, engine_postgis, df_linear_ring):
        """
        Tests that writing a LinearRing works (it is stored as a LINESTRING).
        """
        engine = engine_postgis
        table = "geomtype_tests"

        write_postgis(df_linear_ring, con=engine, name=table, if_exists="replace")

        # Validate geometry type
        sql = text(
            "SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
                table=table
            )
        )
        with engine.connect() as conn:
            geom_type = conn.execute(sql).fetchone()[0]

        assert geom_type.upper() == "LINESTRING"

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_in_chunks(self, engine_postgis, df_mixed_single_and_multi):
        """
        Tests that writing in chunks (chunksize=1) stores every row.
        """
        engine = engine_postgis
        table = "geomtype_tests"

        write_postgis(
            df_mixed_single_and_multi,
            con=engine,
            name=table,
            if_exists="replace",
            chunksize=1,
        )
        # Validate row count
        sql = text("SELECT COUNT(geometry) FROM {table};".format(table=table))
        with engine.connect() as conn:
            row_cnt = conn.execute(sql).fetchone()[0]
        assert row_cnt == 3

        # Validate geometry type
        sql = text(
            "SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
                table=table
            )
        )
        with engine.connect() as conn:
            res = conn.execute(sql).fetchall()
        assert res[0][0].upper() == "LINESTRING"
        assert res[1][0].upper() == "MULTILINESTRING"
        assert res[2][0].upper() == "POINT"

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_to_different_schema(self, engine_postgis, df_nybb):
        """
        Tests writing data to alternative schema.
        """
        engine = engine_postgis

        table = "nybb"
        schema_to_use = "test"
        sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
        with engine.begin() as conn:
            conn.execute(sql)

        write_postgis(
            df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
        )
        # Validate
        sql = text(
            "SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
        )

        df = read_postgis(sql, engine, geom_col="geometry")
        validate_boro_df(df)

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_to_different_schema_when_table_exists(
        self, engine_postgis, df_nybb
    ):
        """
        Tests writing data to alternative schema when the table may already exist.
        """
        engine = engine_postgis

        table = "nybb"
        schema_to_use = "test"
        sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
        with engine.begin() as conn:
            conn.execute(sql)

        # The table may or may not be present from an earlier test, so both
        # outcomes of the if_exists="fail" write are accepted here.
        try:
            write_postgis(
                df_nybb, con=engine, name=table, if_exists="fail", schema=schema_to_use
            )
            # Validate
            sql = text(
                "SELECT * FROM {schema}.{table};".format(
                    schema=schema_to_use, table=table
                )
            )

            df = read_postgis(sql, engine, geom_col="geometry")
            validate_boro_df(df)

        # Should raise a ValueError when table exists
        except ValueError:
            pass

        # Try with replace flag on
        write_postgis(
            df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
        )
        # Validate
        sql = text(
            "SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
        )

        df = read_postgis(sql, engine, geom_col="geometry")
        validate_boro_df(df)

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_3D_geometries(self, engine_postgis, df_3D_geoms):
        """
        Tests that writing geometries with 3 dimensions works.
        """
        engine = engine_postgis

        table = "geomtype_tests"

        write_postgis(df_3D_geoms, con=engine, name=table, if_exists="replace")

        # Check that all geometries have 3 dimensions
        sql = text("SELECT * FROM {table};".format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")
        assert list(df.geometry.has_z) == [True, True, True]

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_row_order(self, engine_postgis, df_nybb):
        """
        Tests that the row order in db table follows the order of the original frame.
        """
        engine = engine_postgis

        table = "row_order_test"
        correct_order = df_nybb["BoroCode"].tolist()

        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")

        # Check that the row order matches
        sql = text("SELECT * FROM {table};".format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")
        assert df["BoroCode"].tolist() == correct_order

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_append_before_table_exists(self, engine_postgis, df_nybb):
        """
        Tests that insert works with if_exists='append' when table does not exist yet.
        """
        engine = engine_postgis

        table = "nybb"
        # If table exists, delete it before trying to write with defaults
        drop_table_if_exists(engine, table)

        write_postgis(df_nybb, con=engine, name=table, if_exists="append")

        # Validate the written table
        sql = text("SELECT * FROM {table};".format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")
        validate_boro_df(df)

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_append_with_different_crs(self, engine_postgis, df_nybb):
        """
        Tests that an error is raised if table CRS differs from frame.
        """
        engine = engine_postgis

        table = "nybb"
        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")

        # Reproject
        df_nybb2 = df_nybb.to_crs(epsg=4326)

        # Should raise error when appending
        with pytest.raises(ValueError, match="CRS of the target table"):
            write_postgis(df_nybb2, con=engine, name=table, if_exists="append")

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_append_without_crs(self, engine_postgis, df_nybb):
        """Tests that a frame without CRS can be appended to a table without CRS."""
        # This test was included in #3328 when the default value for no
        # CRS was changed from an SRID of -1 to 0. This resolves issues
        # of appending dataframes to postgis that have no CRS as postgis
        # no CRS value is 0.
        engine = engine_postgis
        df_nybb = df_nybb.set_crs(None, allow_override=True)
        table = "nybb"

        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
        # append another dataframe with no crs

        df_nybb2 = df_nybb
        write_postgis(df_nybb2, con=engine, name=table, if_exists="append")

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    @pytest.mark.xfail(
        compat.PANDAS_GE_20 and not compat.PANDAS_GE_202,
        reason="Duplicate columns are dropped in read_sql with pandas 2.0.0 and 2.0.1",
    )
    def test_duplicate_geometry_column_fails(self, engine_postgis):
        """
        Tests that a ValueError is raised if an SQL query returns two geometry columns.
        """
        engine = engine_postgis

        sql = "select ST_MakePoint(0, 0) as geom, ST_MakePoint(0, 0) as geom;"

        with pytest.raises(ValueError):
            read_postgis(sql, engine, geom_col="geom")

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_non_epsg_crs(self, connection_postgis, df_nybb):
        """Tests that a non-EPSG (ESRI) CRS is recovered when reading."""
        con = connection_postgis
        df_nybb = df_nybb.to_crs(crs="esri:54052")
        create_postgis(con, df_nybb, srid=54052)

        sql = "SELECT * FROM nybb;"
        df = read_postgis(sql, con)
        validate_boro_df(df)
        assert df.crs == "ESRI:54052"

    @pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
    @mock.patch("shapely.get_srid")
    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_srid_not_in_table(self, mock_get_srid, connection_postgis, df_nybb):
        """Tests the warning and error for an SRID missing from the postgis table."""
        # mock a non-existent srid for edge case if shapely has an srid
        # not present in postgis table.
        pyproj = pytest.importorskip("pyproj")

        mock_get_srid.return_value = 99999

        con = connection_postgis
        df_nybb = df_nybb.to_crs(crs="epsg:4326")
        create_postgis(con, df_nybb)

        sql = "SELECT * FROM nybb;"
        with pytest.raises(pyproj.exceptions.CRSError, match="crs not found"):
            with pytest.warns(UserWarning, match="Could not find srid 99999"):
                read_postgis(sql, con)

    @mock.patch("geopandas.io.sql._get_spatial_ref_sys_df")
    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_no_spatial_ref_sys_table_in_postgis(
        self, mock_get_spatial_ref_sys_df, connection_postgis, df_nybb
    ):
        """Tests the fallback when the spatial reference system table is missing."""
        # mock for a non-existent spatial_ref_sys database

        mock_get_spatial_ref_sys_df.side_effect = pd.errors.DatabaseError

        con = connection_postgis
        df_nybb = df_nybb.to_crs(crs="epsg:4326")
        create_postgis(con, df_nybb, srid=4326)

        sql = "SELECT * FROM nybb;"
        with pytest.warns(
            UserWarning, match="Could not find the spatial reference system table"
        ):
            df = read_postgis(sql, con)

        assert df.crs == "EPSG:4326"

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_non_epsg_crs_chunksize(self, connection_postgis, df_nybb):
        """Test chunksize argument with non epsg crs"""
        chunksize = 2
        con = connection_postgis
        df_nybb = df_nybb.to_crs(crs="esri:54052")

        create_postgis(con, df_nybb, srid=54052)

        sql = "SELECT * FROM nybb;"
        df = pd.concat(read_postgis(sql, con, chunksize=chunksize))

        validate_boro_df(df)
        assert df.crs == "ESRI:54052"
@@ -0,0 +1,118 @@
"""Vendored, cut down version of pyogrio/util.py for use with fiona"""
import re
import sys
from urllib.parse import urlparse
def vsi_path(path: str) -> str:
    """
    Return ``path`` unchanged if it is local, else as a GDAL-compatible vsi path.

    Paths that already start with ``/vsi`` are returned as they are. Anything
    with a supported scheme, an archive part, or a ``.zip`` ending is rewritten
    by ``_construct_vsi_path``.
    """
    # path is already in GDAL format
    if path.startswith("/vsi"):
        return path

    # Windows drive letters (e.g. "C:\") confuse `urlparse` as they look like
    # URL schemes
    has_drive_letter = sys.platform == "win32" and re.match("^[a-zA-Z]\\:", path)
    if has_drive_letter:
        outer = path.split("!")[0]
        if not outer.endswith(".zip"):
            return path

        # prefix then allow to proceed with remaining parsing
        path = f"zip://{path}"

    path, archive, scheme = _parse_uri(path)

    needs_vsi = scheme or archive or path.endswith(".zip")
    if not needs_vsi:
        return path
    return _construct_vsi_path(path, archive, scheme)


# Supported URI schemes and their mapping to GDAL's VSI suffix.
SCHEMES = {
    "file": "file",
    "zip": "zip",
    "tar": "tar",
    "gzip": "gzip",
    "http": "curl",
    "https": "curl",
    "ftp": "curl",
    "s3": "s3",
    "gs": "gs",
    "az": "az",
    "adls": "adls",
    "adl": "adls",  # fsspec uses this
    "hdfs": "hdfs",
    "webhdfs": "webhdfs",
    # GDAL additionally supports oss and swift for remote filesystems, but
    # those are for now not added as supported URI
}

# Schemes that are served through GDAL's curl handler.
CURLSCHEMES = {k for k, v in SCHEMES.items() if v == "curl"}


def _parse_uri(path: str):
    """
    Split a URI into its path, archive and scheme components.

    Returns a tuple of (path, archive, scheme)

    path : str
        Parsed path. Includes the hostname and query string in the case
        of a URI.
    archive : str
        Parsed archive path (the component before the last ``!``), or "".
    scheme : str
        URI scheme such as "https" or "zip+s3".

    When the scheme contains a part that is not in ``SCHEMES`` the input is
    returned untouched as ``(path, "", "")``.
    """
    parsed = urlparse(path, allow_fragments=False)
    scheme = parsed.scheme

    # if the scheme is not one of GDAL's supported schemes, return raw path
    if scheme and any(part not in SCHEMES for part in scheme.split("+")):
        return path, "", ""

    # we have a URI
    location = parsed.path
    if parsed.query:
        location += "?" + parsed.query

    if scheme and parsed.netloc:
        location = parsed.netloc + location

    # The last "!"-separated piece is the path, the one before it the archive;
    # any earlier pieces are ignored.
    *leading, inner = location.split("!")
    archive = leading[-1] if leading else ""
    return (inner, archive, scheme or "")


def _construct_vsi_path(path, archive, scheme) -> str:
    """Convert a parsed path to a GDAL VSI path"""
    parts = scheme.split("+")

    is_zip = archive.endswith(".zip") or path.endswith(".zip")
    if is_zip and "zip" not in parts:
        parts.insert(0, "zip")

    # One "vsi<handler>" segment per scheme part; empty parts and "file" add none.
    prefix = "/".join(
        "vsi{0}".format(SCHEMES[name]) for name in parts if name and name != "file"
    )
    if not prefix:
        return path

    # curl-backed schemes keep their "<scheme>://" in front of the location.
    last = parts[-1]
    suffix = f"{last}://" if last in CURLSCHEMES else ""

    if archive:
        return "/{}/{}{}/{}".format(prefix, suffix, archive, path.lstrip("/"))
    return "/{}/{}{}".format(prefix, suffix, path)
@@ -0,0 +1,977 @@
import warnings
from packaging.version import Version
import numpy as np
import pandas as pd
from pandas import CategoricalDtype
from pandas.plotting import PlotAccessor
import geopandas
from ._decorator import doc
def _sanitize_geoms(geoms, prefix="Multi"):
    """
    Flatten geometries whose type starts with `prefix` into their components.

    Each component is paired with the position of the geometry it came from, so
    positions are repeated for all components of the same Multi geometry.
    Empty or missing geometries are filtered out. Maintains 1:1 matching of
    geometry to value.

    Prefix specifies type of geometry to be flatten. 'Multi' for MultiPoint and
    similar, "Geom" for GeometryCollection.

    Returns
    -------
    components : list of geometry
        (`geoms` itself is returned when nothing needs flattening or filtering)

    component_index : index array
        indices are repeated for all components in the same Multi geometry
    """
    # TODO(shapely) look into simplifying this with
    # shapely.get_parts(geoms, return_index=True) from shapely 2.0
    needs_processing = (
        geoms.geom_type.str.startswith(prefix).any()
        or geoms.is_empty.any()
        or geoms.isna().any()
    )
    if not needs_processing:
        return geoms, np.arange(len(geoms))

    components, component_index = [], []
    for position, geom in enumerate(geoms):
        # Missing and empty geometries contribute nothing.
        if geom is None or geom.is_empty:
            continue

        parts = geom.geoms if geom.geom_type.startswith(prefix) else [geom]
        for part in parts:
            components.append(part)
            component_index.append(position)

    return components, np.array(component_index)
def _expand_kwargs(kwargs, multiindex):
    """
    Expand list-like plot arguments in 'kwargs' (in place) using 'multiindex'.

    Most arguments to the plot functions must be a (single) value, or a sequence
    of values. Every list-like value is replaced by ``np.take(value, multiindex,
    axis=0)``, unless the value appears to already be a valid (single) value
    for its key.
    """
    from typing import Iterable

    from matplotlib.colors import is_color_like

    scalar_kwargs = ["marker", "path_effects"]

    def _is_single_value(att, value):
        """Tell whether `value` must be left alone for the keyword `att`."""
        if "color" in att:  # color(s), edgecolor(s), facecolor(s)
            return is_color_like(value)
        if "linestyle" in att:  # linestyle(s)
            # A single linestyle can be 2-tuple of a number and an iterable.
            return (
                isinstance(value, tuple)
                and len(value) == 2
                and isinstance(value[1], Iterable)
            )
        # For these attributes, only a single value is allowed, so never expand.
        return att in scalar_kwargs

    for att, value in kwargs.items():
        if _is_single_value(att, value):
            continue
        if pd.api.types.is_list_like(value):
            kwargs[att] = np.take(value, multiindex, axis=0)
def _PolygonPatch(polygon, **kwargs):
    """Build a matplotlib ``PathPatch`` from a Polygon geometry.

    The exterior ring and every interior ring are each turned into a
    ``matplotlib.path.Path`` (only the first two coordinate columns are used)
    and merged into one compound path.

    The `kwargs` are passed unchanged to ``matplotlib.patches.PathPatch``.

    Example (using Shapely Point and a matplotlib axes)::

        b = shapely.geometry.Point(0, 0).buffer(1.0)
        patch = _PolygonPatch(b, fc='blue', ec='blue', alpha=0.5)
        ax.add_patch(patch)

    GeoPandas originally relied on the descartes package by Sean Gillies
    (BSD license, https://pypi.org/project/descartes) for PolygonPatch, but
    this dependency was removed in favor of the below matplotlib code.
    """
    from matplotlib.patches import PathPatch
    from matplotlib.path import Path

    def _ring_to_path(ring):
        # Keep x/y only; any further coordinate columns are dropped.
        return Path(np.asarray(ring.coords)[:, :2])

    ring_paths = [_ring_to_path(polygon.exterior)]
    for hole in polygon.interiors:
        ring_paths.append(_ring_to_path(hole))

    compound = Path.make_compound_path(*ring_paths)
    return PathPatch(compound, **kwargs)
def _plot_polygon_collection(
    ax,
    geoms,
    values=None,
    color=None,
    cmap=None,
    vmin=None,
    vmax=None,
    autolim=True,
    **kwargs,
):
    """
    Draw Polygon and MultiPolygon geometries on `ax` as one PatchCollection.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        where shapes will be plotted
    geoms : a sequence of `N` Polygons and/or MultiPolygons (can be mixed)
    values : a sequence of `N` values, optional
        Values will be mapped to colors using vmin/vmax/cmap. They should
        have 1:1 correspondence with the geometries (not their components).
        Otherwise follows `color` / `facecolor` kwargs.
    edgecolor : single color or sequence of `N` colors
        Color for the edge of the polygons
    facecolor : single color or sequence of `N` colors
        Color to fill the polygons. Cannot be used together with `values`.
    color : single color or sequence of `N` colors
        Sets both `edgecolor` and `facecolor`
    autolim : bool (default True)
        Update axes data limits to contain the new geometries.
    **kwargs
        Additional keyword arguments passed to the collection

    Returns
    -------
    collection : matplotlib.collections.Collection that was plotted
    """
    from matplotlib.collections import PatchCollection

    # Flatten multi-part geometries; `multiindex` maps every component back
    # to the position of its source geometry.
    geoms, multiindex = _sanitize_geoms(geoms)
    color_by_value = values is not None
    if color_by_value:
        values = np.take(values, multiindex, axis=0)

    # PatchCollection does not accept some kwargs.
    for unsupported in ("markersize", "marker"):
        kwargs.pop(unsupported, None)

    # Add to kwargs for easier checking below.
    if color is not None:
        kwargs["color"] = color

    _expand_kwargs(kwargs, multiindex)

    patches = [_PolygonPatch(part) for part in geoms]
    collection = PatchCollection(patches, **kwargs)

    if color_by_value:
        collection.set_array(np.asarray(values))
        collection.set_cmap(cmap)
        # An explicit norm takes precedence over vmin/vmax.
        if "norm" not in kwargs:
            collection.set_clim(vmin, vmax)

    ax.add_collection(collection, autolim=autolim)
    ax.autoscale_view()
    return collection
def _plot_linestring_collection(
    ax,
    geoms,
    values=None,
    color=None,
    cmap=None,
    vmin=None,
    vmax=None,
    autolim=True,
    **kwargs,
):
    """
    Draw LineString and MultiLineString geometries on `ax` as one LineCollection.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        where shapes will be plotted
    geoms : a sequence of `N` LineStrings and/or MultiLineStrings (can be
            mixed)
    values : a sequence of `N` values, optional
        Values will be mapped to colors using vmin/vmax/cmap. They should
        have 1:1 correspondence with the geometries (not their components).
    color : single color or sequence of `N` colors
        Cannot be used together with `values`.
    autolim : bool (default True)
        Update axes data limits to contain the new geometries.

    Returns
    -------
    collection : matplotlib.collections.Collection that was plotted
    """
    from matplotlib.collections import LineCollection

    # Flatten multi-part geometries; `multiindex` maps every component back
    # to the position of its source geometry.
    geoms, multiindex = _sanitize_geoms(geoms)
    color_by_value = values is not None
    if color_by_value:
        values = np.take(values, multiindex, axis=0)

    # LineCollection does not accept some kwargs.
    for unsupported in ("markersize", "marker"):
        kwargs.pop(unsupported, None)

    # Add to kwargs for easier checking below.
    if color is not None:
        kwargs["color"] = color

    _expand_kwargs(kwargs, multiindex)

    # One (n, 2) coordinate array per line; extra coordinate columns are
    # dropped.
    segments = []
    for line in geoms:
        segments.append(np.array(line.coords)[:, :2])
    collection = LineCollection(segments, **kwargs)

    if color_by_value:
        collection.set_array(np.asarray(values))
        collection.set_cmap(cmap)
        # An explicit norm takes precedence over vmin/vmax.
        if "norm" not in kwargs:
            collection.set_clim(vmin, vmax)

    ax.add_collection(collection, autolim=autolim)
    ax.autoscale_view()
    return collection
def _plot_point_collection(
    ax,
    geoms,
    values=None,
    color=None,
    cmap=None,
    vmin=None,
    vmax=None,
    marker="o",
    markersize=None,
    **kwargs,
):
    """
    Draw Point and MultiPoint geometries on `ax` with ``ax.scatter``.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        where shapes will be plotted
    geoms : sequence of `N` Points or MultiPoints
    values : a sequence of `N` values, optional
        Values mapped to colors using vmin, vmax, and cmap.
        Cannot be specified together with `color`.
    markersize : scalar or array-like, optional
        Size of the markers. Note that under the hood ``scatter`` is
        used, so the specified value will be proportional to the
        area of the marker (size in points^2).

    Returns
    -------
    collection : matplotlib.collections.Collection that was plotted

    Raises
    ------
    ValueError
        If both `values` and `color` are given.
    """
    if values is not None and color is not None:
        raise ValueError("Can only specify one of 'values' and 'color' kwargs")

    geoms, multiindex = _sanitize_geoms(geoms)
    # values are expanded below as kwargs["c"]

    # Empty points contribute a None coordinate pair.
    xs, ys = [], []
    for pt in geoms:
        if pt.is_empty:
            xs.append(None)
            ys.append(None)
        else:
            xs.append(pt.x)
            ys.append(pt.y)

    # matplotlib 1.4 does not support c=None, and < 2.0 does not support s=None,
    # so only forward the settings that were actually given.
    for key, setting in (
        ("c", values),
        ("s", markersize),
        ("color", color),
        ("marker", marker),
    ):
        if setting is not None:
            kwargs[key] = setting

    _expand_kwargs(kwargs, multiindex)

    # An explicit norm takes precedence over vmin/vmax.
    if "norm" in kwargs:
        return ax.scatter(xs, ys, cmap=cmap, **kwargs)
    return ax.scatter(xs, ys, vmin=vmin, vmax=vmax, cmap=cmap, **kwargs)
def plot_series(
    s,
    cmap=None,
    color=None,
    ax=None,
    figsize=None,
    aspect="auto",
    autolim=True,
    **style_kwds,
):
    """
    Plot a GeoSeries.

    Generate a plot of a GeoSeries geometry with matplotlib.

    Parameters
    ----------
    s : Series
        The GeoSeries to be plotted. Currently Polygon,
        MultiPolygon, LineString, MultiLineString, Point and MultiPoint
        geometries can be plotted.
    cmap : str (default None)
        The name of a colormap recognized by matplotlib. Any
        colormap will work, but categorical colormaps are
        generally recommended. Examples of useful discrete
        colormaps include:

            tab10, tab20, Accent, Dark2, Paired, Pastel1, Set1, Set2

    color : str, np.array, pd.Series, List (default None)
        If specified, all objects will be colored uniformly.
    ax : matplotlib.pyplot.Artist (default None)
        axes on which to draw the plot
    figsize : pair of floats (default None)
        Size of the resulting matplotlib.figure.Figure. If the argument
        ax is given explicitly, figsize is ignored.
    aspect : 'auto', 'equal', None or float (default 'auto')
        Set aspect of axis. If 'auto', the default aspect for map plots is 'equal'; if
        however data are not projected (coordinates are long/lat), the aspect is by
        default set to 1/cos(s_y * pi/180) with s_y the y coordinate of the middle of
        the GeoSeries (the mean of the y range of bounding box) so that a long/lat
        square appears square in the middle of the plot. This implies an
        Equirectangular projection. If the middle latitude cannot be determined
        (the bounds are not finite, e.g. nothing to plot), 'equal' is used.
        If None, the aspect of `ax` won't be changed. It can
        also be set manually (float) as the ratio of y-unit to x-unit.
    autolim : bool (default True)
        Update axes data limits to contain the new geometries.
    **style_kwds : dict
        Color options to be passed on to the actual plot function, such
        as ``edgecolor``, ``facecolor``, ``linewidth``, ``markersize``,
        ``alpha``.

    Returns
    -------
    ax : matplotlib axes instance

    Raises
    ------
    ImportError
        If matplotlib is not installed.

    Warns
    -----
    UserWarning
        If the GeoSeries is empty or consists only of empty geometries; the
        axes are returned without drawing anything.
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError as err:
        raise ImportError(
            "The matplotlib package is required for plotting in geopandas. "
            "You can install it using 'conda install -c conda-forge matplotlib' or "
            "'pip install matplotlib'."
        ) from err

    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    if aspect == "auto":
        if s.crs and s.crs.is_geographic:
            bounds = s.total_bounds
            y_coord = np.mean([bounds[1], bounds[3]])
            if np.isfinite(y_coord):
                ax.set_aspect(1 / np.cos(y_coord * np.pi / 180))
                # formula ported from R package sp
                # https://github.com/edzer/sp/blob/master/R/mapasp.R
            else:
                # Non-finite bounds (nothing to plot) would make matplotlib
                # reject the aspect before the "empty" warnings below can be
                # issued; fall back to the default map aspect instead.
                ax.set_aspect("equal")
        else:
            ax.set_aspect("equal")
    elif aspect is not None:
        ax.set_aspect(aspect)

    if s.empty:
        warnings.warn(
            "The GeoSeries you are attempting to plot is "
            "empty. Nothing has been displayed.",
            UserWarning,
            stacklevel=3,
        )
        return ax

    if s.is_empty.all():
        warnings.warn(
            "The GeoSeries you are attempting to plot is "
            "composed of empty geometries. Nothing has been displayed.",
            UserWarning,
            stacklevel=3,
        )
        return ax

    # have colors been given for all geometries?
    color_given = pd.api.types.is_list_like(color) and len(color) == len(s)

    # if cmap is specified, create range of colors based on cmap
    values = None
    if cmap is not None:
        values = np.arange(len(s))
        if hasattr(cmap, "N"):
            # cycle through the colormap when there are more geometries
            # than colors
            values = values % cmap.N
        style_kwds["vmin"] = style_kwds.get("vmin", values.min())
        style_kwds["vmax"] = style_kwds.get("vmax", values.max())

    # decompose GeometryCollections
    geoms, multiindex = _sanitize_geoms(s.geometry, prefix="Geom")
    values = np.take(values, multiindex, axis=0) if cmap else None
    # ensure indexes are consistent
    if color_given and isinstance(color, pd.Series):
        color = color.reindex(s.index)
    expl_color = np.take(color, multiindex, axis=0) if color_given else color
    expl_series = geopandas.GeoSeries(geoms)

    geom_types = expl_series.geom_type
    poly_idx = np.asarray((geom_types == "Polygon") | (geom_types == "MultiPolygon"))
    line_idx = np.asarray(
        (geom_types == "LineString")
        | (geom_types == "MultiLineString")
        | (geom_types == "LinearRing")
    )
    point_idx = np.asarray((geom_types == "Point") | (geom_types == "MultiPoint"))

    # plot all Polygons and all MultiPolygon components in the same collection
    polys = expl_series[poly_idx]
    if not polys.empty:
        # color overrides both face and edgecolor. As we want people to be
        # able to use edgecolor as well, pass color to facecolor
        facecolor = style_kwds.pop("facecolor", None)
        color_ = expl_color[poly_idx] if color_given else color
        if color is not None:
            facecolor = color_

        values_ = values[poly_idx] if cmap else None
        _plot_polygon_collection(
            ax,
            polys,
            values_,
            facecolor=facecolor,
            cmap=cmap,
            autolim=autolim,
            **style_kwds,
        )

    # plot all LineStrings and MultiLineString components in same collection
    lines = expl_series[line_idx]
    if not lines.empty:
        values_ = values[line_idx] if cmap else None
        color_ = expl_color[line_idx] if color_given else color

        _plot_linestring_collection(
            ax, lines, values_, color=color_, cmap=cmap, autolim=autolim, **style_kwds
        )

    # plot all Points in the same collection
    points = expl_series[point_idx]
    if not points.empty:
        values_ = values[point_idx] if cmap else None
        color_ = expl_color[point_idx] if color_given else color

        _plot_point_collection(
            ax, points, values_, color=color_, cmap=cmap, **style_kwds
        )

    ax.figure.canvas.draw_idle()
    return ax
def plot_dataframe(
    df,
    column=None,
    cmap=None,
    color=None,
    ax=None,
    cax=None,
    categorical=False,
    legend=False,
    scheme=None,
    k=5,
    vmin=None,
    vmax=None,
    markersize=None,
    figsize=None,
    legend_kwds=None,
    categories=None,
    classification_kwds=None,
    missing_kwds=None,
    aspect="auto",
    autolim=True,
    **style_kwds,
):
    """
    Plot a GeoDataFrame.

    Generate a plot of a GeoDataFrame with matplotlib.  If a
    column is specified, the plot coloring will be based on values
    in that column.

    Parameters
    ----------
    column : str, np.array, pd.Series (default None)
        The name of the dataframe column, np.array, or pd.Series to be plotted.
        If np.array or pd.Series are used then it must have same length as
        dataframe. Values are used to color the plot. Ignored if `color` is
        also set.
    kind: str
        The kind of plots to produce. The default is to create a map ("geo").
        Other supported kinds of plots from pandas:

        - 'line' : line plot
        - 'bar' : vertical bar plot
        - 'barh' : horizontal bar plot
        - 'hist' : histogram
        - 'box' : BoxPlot
        - 'kde' : Kernel Density Estimation plot
        - 'density' : same as 'kde'
        - 'area' : area plot
        - 'pie' : pie plot
        - 'scatter' : scatter plot
        - 'hexbin' : hexbin plot.
    cmap : str (default None)
        The name of a colormap recognized by matplotlib.
    color : str, np.array, pd.Series (default None)
        If specified, all objects will be colored uniformly.
    ax : matplotlib.pyplot.Artist (default None)
        axes on which to draw the plot
    cax : matplotlib.pyplot Artist (default None)
        axes on which to draw the legend in case of color map.
    categorical : bool (default False)
        If False, cmap will reflect numerical values of the
        column being plotted.  For non-numerical columns, this
        will be set to True.
    legend : bool (default False)
        Plot a legend. Ignored if no `column` is given, or if `color` is given.
    scheme : str (default None)
        Name of a choropleth classification scheme (requires mapclassify).
        A mapclassify.MapClassifier object will be used
        under the hood. Supported are all schemes provided by mapclassify (e.g.
        'BoxPlot', 'EqualInterval', 'FisherJenks', 'FisherJenksSampled',
        'HeadTailBreaks', 'JenksCaspall', 'JenksCaspallForced',
        'JenksCaspallSampled', 'MaxP', 'MaximumBreaks',
        'NaturalBreaks', 'Quantiles', 'Percentiles', 'StdMean',
        'UserDefined'). Arguments can be passed in classification_kwds.
    k : int (default 5)
        Number of classes (ignored if scheme is None)
    vmin : None or float (default None)
        Minimum value of cmap. If None, the minimum data value
        in the column to be plotted is used.
    vmax : None or float (default None)
        Maximum value of cmap. If None, the maximum data value
        in the column to be plotted is used.
    markersize : str or float or sequence (default None)
        Only applies to point geometries within a frame.
        If a str, will use the values in the column of the frame specified
        by markersize to set the size of markers. Otherwise can be a value
        to apply to all points, or a sequence of the same length as the
        number of points.
    figsize : tuple of integers (default None)
        Size of the resulting matplotlib.figure.Figure. If the argument
        axes is given explicitly, figsize is ignored.
    legend_kwds : dict (default None)
        Keyword arguments to pass to :func:`matplotlib.pyplot.legend` or
        :func:`matplotlib.pyplot.colorbar`. The dict is copied, so the
        caller's object is not modified.
        Additional accepted keywords when `scheme` is specified:

        fmt : string
            A formatting specification for the bin edges of the classes in the
            legend. For example, to have no decimals: ``{"fmt": "{:.0f}"}``.
        labels : list-like
            A list of legend labels to override the auto-generated labels.
            Needs to have the same number of elements as the number of
            classes (`k`).
        interval : boolean (default False)
            An option to control brackets from mapclassify legend.
            If True, open/closed interval brackets are shown in the legend.
    categories : list-like
        Ordered list-like object of categories to be used for categorical plot.
    classification_kwds : dict (default None)
        Keyword arguments to pass to mapclassify. The dict is copied, so the
        caller's object is not modified.
    missing_kwds : dict (default None)
        Keyword arguments specifying color options (as style_kwds)
        to be passed on to geometries with missing values in addition to
        or overwriting other style kwds. If None, geometries with missing
        values are not plotted.
    aspect : 'auto', 'equal', None or float (default 'auto')
        Set aspect of axis. If 'auto', the default aspect for map plots is 'equal'; if
        however data are not projected (coordinates are long/lat), the aspect is by
        default set to 1/cos(df_y * pi/180) with df_y the y coordinate of the middle of
        the GeoDataFrame (the mean of the y range of bounding box) so that a long/lat
        square appears square in the middle of the plot. This implies an
        Equirectangular projection. If the middle latitude cannot be determined
        (the bounds are not finite, e.g. nothing to plot), 'equal' is used.
        If None, the aspect of `ax` won't be changed. It can
        also be set manually (float) as the ratio of y-unit to x-unit.
    autolim : bool (default True)
        Update axes data limits to contain the new geometries.
    **style_kwds : dict
        Style options to be passed on to the actual plot function, such
        as ``edgecolor``, ``facecolor``, ``linewidth``, ``markersize``,
        ``alpha``.

    Returns
    -------
    ax : matplotlib axes instance

    Examples
    --------
    >>> import geodatasets
    >>> df = geopandas.read_file(geodatasets.get_path("nybb"))
    >>> df.head()  # doctest: +SKIP
       BoroCode  ...                                           geometry
    0         5  ...  MULTIPOLYGON (((970217.022 145643.332, 970227....
    1         4  ...  MULTIPOLYGON (((1029606.077 156073.814, 102957...
    2         3  ...  MULTIPOLYGON (((1021176.479 151374.797, 102100...
    3         1  ...  MULTIPOLYGON (((981219.056 188655.316, 980940....
    4         2  ...  MULTIPOLYGON (((1012821.806 229228.265, 101278...

    >>> df.plot("BoroName", cmap="Set1")  # doctest: +SKIP

    See the User Guide page :doc:`../../user_guide/mapping` for details.

    """
    if column is not None and color is not None:
        warnings.warn(
            "Only specify one of 'column' or 'color'. Using 'color'.",
            UserWarning,
            stacklevel=3,
        )
        column = None

    try:
        import matplotlib.pyplot as plt
    except ImportError as err:
        raise ImportError(
            "The matplotlib package is required for plotting in geopandas. "
            "You can install it using 'conda install -c conda-forge matplotlib' or "
            "'pip install matplotlib'."
        ) from err

    if ax is None:
        if cax is not None:
            raise ValueError("'ax' can not be None if 'cax' is not.")
        _, ax = plt.subplots(figsize=figsize)

    if aspect == "auto":
        if df.crs and df.crs.is_geographic:
            bounds = df.total_bounds
            y_coord = np.mean([bounds[1], bounds[3]])
            if np.isfinite(y_coord):
                ax.set_aspect(1 / np.cos(y_coord * np.pi / 180))
                # formula ported from R package sp
                # https://github.com/edzer/sp/blob/master/R/mapasp.R
            else:
                # Non-finite bounds (nothing to plot) would make matplotlib
                # reject the aspect before the "empty" warning below can be
                # issued; fall back to the default map aspect instead.
                ax.set_aspect("equal")
        else:
            ax.set_aspect("equal")
    elif aspect is not None:
        ax.set_aspect(aspect)

    # GH 1555
    # if legend_kwds set, copy so we don't update it in place
    if legend_kwds is not None:
        legend_kwds = legend_kwds.copy()

    if df.empty:
        warnings.warn(
            "The GeoDataFrame you are attempting to plot is "
            "empty. Nothing has been displayed.",
            UserWarning,
            stacklevel=3,
        )
        return ax

    if isinstance(markersize, str):
        markersize = df[markersize].values

    # Without a column there is nothing to map to colors: plot the geometries
    # only.
    if column is None:
        return plot_series(
            df.geometry,
            cmap=cmap,
            color=color,
            ax=ax,
            figsize=figsize,
            markersize=markersize,
            aspect=aspect,
            autolim=autolim,
            **style_kwds,
        )

    # To accept pd.Series and np.arrays as column
    if isinstance(column, (np.ndarray, pd.Series)):
        if column.shape[0] != df.shape[0]:
            raise ValueError(
                "The dataframe and given column have different number of rows."
            )
        else:
            values = column

            # Make sure index of a Series matches index of df
            if isinstance(values, pd.Series):
                values = values.reindex(df.index)
    else:
        values = df[column]

    if isinstance(values.dtype, CategoricalDtype):
        if categories is not None:
            raise ValueError(
                "Cannot specify 'categories' when column has categorical dtype"
            )
        categorical = True
    elif (
        pd.api.types.is_object_dtype(values.dtype)
        or pd.api.types.is_bool_dtype(values.dtype)
        or pd.api.types.is_string_dtype(values.dtype)
        or categories
    ):
        categorical = True

    nan_idx = np.asarray(pd.isna(values), dtype="bool")

    if scheme is not None:
        mc_err = (
            "The 'mapclassify' package (>= 2.4.0) is "
            "required to use the 'scheme' keyword."
        )
        try:
            import mapclassify
        except ImportError as err:
            raise ImportError(mc_err) from err

        if Version(mapclassify.__version__) < Version("2.4.0"):
            raise ImportError(mc_err)

        # Work on a copy so that adding the default "k" does not leak into the
        # caller's dict (same reasoning as for legend_kwds above).
        classification_kwds = dict(classification_kwds or {})
        classification_kwds.setdefault("k", k)

        binning = mapclassify.classify(
            np.asarray(values[~nan_idx]), scheme, **classification_kwds
        )
        # set categorical to True for creating the legend
        categorical = True
        if legend_kwds is not None and "labels" in legend_kwds:
            if len(legend_kwds["labels"]) != binning.k:
                raise ValueError(
                    "Number of labels must match number of bins, "
                    "received {} labels for {} bins".format(
                        len(legend_kwds["labels"]), binning.k
                    )
                )
            else:
                labels = list(legend_kwds.pop("labels"))
        else:
            fmt = "{:.2f}"
            if legend_kwds is not None and "fmt" in legend_kwds:
                fmt = legend_kwds.pop("fmt")

            labels = binning.get_legend_classes(fmt)
            if legend_kwds is not None:
                show_interval = legend_kwds.pop("interval", False)
            else:
                show_interval = False
            if not show_interval:
                # strip the leading/trailing interval bracket of each label
                labels = [c[1:-1] for c in labels]

        # Replace the raw values by their bin, leaving missing entries as NaN.
        values = pd.Categorical(
            [np.nan] * len(values), categories=binning.bins, ordered=True
        )
        values[~nan_idx] = pd.Categorical.from_codes(
            binning.yb, categories=binning.bins, ordered=True
        )
        if cmap is None:
            cmap = "viridis"

    # Define `values` as a Series
    if categorical:
        if cmap is None:
            cmap = "tab10"

        cat = pd.Categorical(values, categories=categories)
        categories = list(cat.categories)

        # values missing in the Categorical but not in original values
        missing = list(np.unique(values[~nan_idx & cat.isna()]))
        if missing:
            raise ValueError(
                "Column contains values not listed in categories. "
                "Missing categories: {}.".format(missing)
            )

        # From here on the non-missing values are integer category codes.
        values = cat.codes[~nan_idx]
        vmin = 0 if vmin is None else vmin
        vmax = len(categories) - 1 if vmax is None else vmax

    # fill values with placeholder where were NaNs originally to map them properly
    # (after removing them in categorical or scheme)
    if categorical:
        for n in np.where(nan_idx)[0]:
            values = np.insert(values, n, values[0])

    mn = values[~np.isnan(values)].min() if vmin is None else vmin
    mx = values[~np.isnan(values)].max() if vmax is None else vmax

    # decompose GeometryCollections
    geoms, multiindex = _sanitize_geoms(df.geometry, prefix="Geom")
    values = np.take(values, multiindex, axis=0)
    nan_idx = np.take(nan_idx, multiindex, axis=0)
    expl_series = geopandas.GeoSeries(geoms)

    geom_types = expl_series.geom_type
    poly_idx = np.asarray((geom_types == "Polygon") | (geom_types == "MultiPolygon"))
    line_idx = np.asarray(
        (geom_types == "LineString")
        | (geom_types == "MultiLineString")
        | (geom_types == "LinearRing")
    )
    point_idx = np.asarray((geom_types == "Point") | (geom_types == "MultiPoint"))

    # plot all Polygons and all MultiPolygon components in the same collection
    polys = expl_series[poly_idx & np.invert(nan_idx)]
    subset = values[poly_idx & np.invert(nan_idx)]
    if not polys.empty:
        _plot_polygon_collection(
            ax,
            polys,
            subset,
            vmin=mn,
            vmax=mx,
            cmap=cmap,
            autolim=autolim,
            **style_kwds,
        )

    # plot all LineStrings and MultiLineString components in same collection
    lines = expl_series[line_idx & np.invert(nan_idx)]
    subset = values[line_idx & np.invert(nan_idx)]
    if not lines.empty:
        _plot_linestring_collection(
            ax,
            lines,
            subset,
            vmin=mn,
            vmax=mx,
            cmap=cmap,
            autolim=autolim,
            **style_kwds,
        )

    # plot all Points in the same collection
    points = expl_series[point_idx & np.invert(nan_idx)]
    subset = values[point_idx & np.invert(nan_idx)]
    if not points.empty:
        if isinstance(markersize, np.ndarray):
            # align per-row marker sizes with the exploded, non-missing points
            markersize = np.take(markersize, multiindex, axis=0)
            markersize = markersize[point_idx & np.invert(nan_idx)]
        _plot_point_collection(
            ax,
            points,
            subset,
            vmin=mn,
            vmax=mx,
            markersize=markersize,
            cmap=cmap,
            **style_kwds,
        )

    missing_data = not expl_series[nan_idx].empty
    if missing_kwds is not None and missing_data:
        # NOTE(review): a non-None `color` resets `column` at the top of this
        # function and returns via plot_series, so `color` appears to always
        # be None here and this branch is not expected to run.
        if color:
            if "color" not in missing_kwds:
                missing_kwds["color"] = color

        merged_kwds = style_kwds.copy()
        merged_kwds.update(missing_kwds)

        plot_series(expl_series[nan_idx], ax=ax, **merged_kwds)

    if legend and not color:
        if legend_kwds is None:
            legend_kwds = {}
        if "fmt" in legend_kwds:
            legend_kwds.pop("fmt")

        from matplotlib import cm
        from matplotlib.colors import Normalize
        from matplotlib.lines import Line2D

        norm = style_kwds.get("norm", None)
        if not norm:
            norm = Normalize(vmin=mn, vmax=mx)
        n_cmap = cm.ScalarMappable(norm=norm, cmap=cmap)
        if categorical:
            if scheme is not None:
                categories = labels
            # one round marker per category, colored by its integer code
            patches = []
            for i in range(len(categories)):
                patches.append(
                    Line2D(
                        [0],
                        [0],
                        linestyle="none",
                        marker="o",
                        alpha=style_kwds.get("alpha", 1),
                        markersize=10,
                        markerfacecolor=n_cmap.to_rgba(i),
                        markeredgewidth=0,
                    )
                )
            if missing_kwds is not None and missing_data:
                if "color" in merged_kwds:
                    merged_kwds["facecolor"] = merged_kwds["color"]

                # extra legend entry for the geometries with missing values
                patches.append(
                    Line2D(
                        [0],
                        [0],
                        linestyle="none",
                        marker="o",
                        alpha=merged_kwds.get("alpha", 1),
                        markersize=10,
                        markerfacecolor=merged_kwds.get("facecolor", None),
                        markeredgecolor=merged_kwds.get("edgecolor", None),
                        markeredgewidth=merged_kwds.get(
                            "linewidth", 1 if merged_kwds.get("edgecolor", False) else 0
                        ),
                    )
                )
                categories.append(merged_kwds.get("label", "NaN"))
            legend_kwds.setdefault("numpoints", 1)
            legend_kwds.setdefault("loc", "best")
            legend_kwds.setdefault("handles", patches)
            legend_kwds.setdefault("labels", categories)
            ax.legend(**legend_kwds)
        else:
            if cax is not None:
                legend_kwds.setdefault("cax", cax)
            else:
                legend_kwds.setdefault("ax", ax)

            n_cmap.set_array(np.array([]))
            ax.get_figure().colorbar(n_cmap, **legend_kwds)

    ax.figure.canvas.draw_idle()
    return ax
# The class docstring is generated from ``plot_dataframe`` by the ``doc``
# decorator, so no docstring is written on the class itself.
@doc(plot_dataframe)
class GeoplotAccessor(PlotAccessor):
    # Plot kinds that are delegated to the regular pandas plotting accessor.
    _pandas_kinds = PlotAccessor._all_kinds

    def __call__(self, *args, **kwargs):
        """Dispatch to the geo plot or to a pandas plot depending on ``kind``.

        ``kind`` is read from the keyword arguments and defaults to ``"geo"``.
        Positional and remaining keyword arguments are forwarded to
        ``plot_dataframe`` (for ``"geo"``) or to the pandas ``PlotAccessor``
        (for any kind it supports).

        Raises
        ------
        ValueError
            If ``kind`` is neither ``"geo"`` nor a pandas plot kind.
        """
        data = self._parent.copy()
        kind = kwargs.pop("kind", "geo")
        if kind == "geo":
            return plot_dataframe(data, *args, **kwargs)
        if kind in self._pandas_kinds:
            # Access pandas plots; positional arguments are passed through
            # as well instead of being silently dropped.
            return PlotAccessor(data)(*args, kind=kind, **kwargs)
        raise ValueError(f"{kind} is not a valid plot kind")

    def geo(self, *args, **kwargs):
        """Plot the geometries; shorthand for calling with ``kind="geo"``."""
        return self(*args, kind="geo", **kwargs)
@@ -0,0 +1,505 @@
import numpy as np
import shapely
from shapely.geometry.base import BaseGeometry
from . import _compat as compat
from . import array, geoseries
# Predicates accepted by SpatialIndex.query: ``None`` (bounding-box query
# only) plus the name of every member of shapely's ``BinaryPredicate`` enum.
PREDICATES = {None}
PREDICATES.update(p.name for p in shapely.strtree.BinaryPredicate)
# "dwithin" is only offered when the GEOS >= 3.10 compat flag is set.
if compat.GEOS_GE_310:
    PREDICATES.add("dwithin")
class SpatialIndex:
"""A simple wrapper around Shapely's STRTree.
Parameters
----------
geometry : np.array of Shapely geometries
Geometries from which to build the spatial index.
"""
def __init__(self, geometry):
    """Build the STRtree from ``geometry`` and keep a copy of the input."""
    # Empty geometries are replaced by None in the array handed to the tree:
    # this keeps positions aligned with the input and avoids a segfault on
    # GEOS <= 3.6, see:
    # https://github.com/pygeos/pygeos/issues/146
    # https://github.com/pygeos/pygeos/issues/147
    tree_input = geometry.copy()
    empty_mask = shapely.is_empty(tree_input)
    tree_input[empty_mask] = None
    self._tree = shapely.STRtree(tree_input)
    # store geometries, including empty geometries for user access
    self.geometries = geometry.copy()
@property
def valid_query_predicates(self):
    """Returns valid predicates for the spatial index.

    The returned object is the module-level ``PREDICATES`` set itself (not a
    copy): ``None`` plus the names of shapely's ``BinaryPredicate`` members,
    with ``"dwithin"`` included only when the GEOS >= 3.10 compat flag is set.

    Returns
    -------
    set
        Set of valid predicates for this spatial index.

    Examples
    --------
    >>> from shapely.geometry import Point
    >>> s = geopandas.GeoSeries([Point(0, 0), Point(1, 1)])
    >>> s.sindex.valid_query_predicates  # doctest: +SKIP
    {None, "contains", "contains_properly", "covered_by", "covers", \
"crosses", "dwithin", "intersects", "overlaps", "touches", "within"}
    """
    return PREDICATES
def query(
    self, geometry, predicate=None, sort=False, distance=None, output_format="tuple"
):
    """
    Return the integer indices of all combinations of each input geometry
    and tree geometries where the bounding box of each input geometry
    intersects the bounding box of a tree geometry.

    If the input geometry is a scalar, this returns an array of shape (n, ) with
    the indices of the matching tree geometries.  If the input geometry is an
    array_like, this returns an array with shape (2,n) where the subarrays
    correspond to the indices of the input geometries and indices of the
    tree geometries associated with each.  To generate an array of pairs of
    input geometry index and tree geometry index, simply transpose the
    result.

    If a predicate is provided, the tree geometries are first queried based
    on the bounding box of the input geometry and then are further filtered
    to those that meet the predicate when comparing the input geometry to
    the tree geometry: ``predicate(geometry, tree_geometry)``.

    The 'dwithin' predicate requires GEOS >= 3.10.

    Bounding boxes are limited to two dimensions and are axis-aligned
    (equivalent to the ``bounds`` property of a geometry); any Z values
    present in input geometries are ignored when querying the tree.

    Any input geometry that is None or empty will never match geometries in
    the tree.

    Parameters
    ----------
    geometry : shapely.Geometry or array-like of geometries \
(numpy.ndarray, GeoSeries, GeometryArray)
        A single shapely geometry or array of geometries to query against
        the spatial index. For array-like, accepts both GeoPandas geometry
        iterables (GeoSeries, GeometryArray) or a numpy array of Shapely
        geometries.
    predicate : {None, "contains", "contains_properly", "covered_by", "covers", \
"crosses", "intersects", "overlaps", "touches", "within", "dwithin"}, optional
        If predicate is provided, the input geometries are tested
        using the predicate function against each item in the tree
        whose extent intersects the envelope of the input geometry:
        ``predicate(input_geometry, tree_geometry)``.
        If possible, prepared geometries are used to help speed up the
        predicate operation.
    sort : bool, default False
        If True, the results will be sorted in ascending order. In case
        of 2D array, the result is sorted lexicographically using the
        geometries' indexes as the primary key and the sindex's indexes
        as the secondary key.
        If False, no additional sorting is applied (results are often
        sorted but there is no guarantee).
        Results are always sorted when ``output_format`` is not "tuple".
    distance : number or array_like, optional
        Distances around each input geometry within which to query the tree for
        the 'dwithin' predicate. If array_like, shape must be broadcastable to shape
        of geometry. Required if ``predicate='dwithin'``.
    output_format : {"tuple", "sparse", "dense"}, default "tuple"
        Format of the result. "tuple" returns the index array(s) described
        above. "sparse" returns a boolean ``scipy.sparse.coo_array`` and
        "dense" a boolean ``numpy.ndarray``, both of shape
        ``(len(tree geometries), len(geometry))`` where entry ``[i, j]`` is
        True when tree geometry ``i`` matches input geometry ``j``. The
        "sparse" and "dense" formats require an array-like ``geometry``.

    Returns
    -------
    ndarray with shape (n,) if geometry is a scalar
        Integer indices for matching geometries from the spatial index
        tree geometries.

    OR

    ndarray with shape (2, n) if geometry is an array_like
        The first subarray contains input geometry integer indices.
        The second subarray contains tree geometry integer indices.

    OR

    scipy.sparse.coo_array or ndarray of bool
        If ``output_format`` is "sparse" or "dense", see above.

    Raises
    ------
    ValueError
        If ``predicate`` or ``output_format`` is not supported, if
        ``distance`` is missing for the 'dwithin' predicate, or if
        ``distance`` is given with any other predicate.
    TypeError
        If ``output_format`` is "sparse" or "dense" and ``geometry`` is not
        array-like.

    Examples
    --------
    >>> from shapely.geometry import Point, box
    >>> s = geopandas.GeoSeries(geopandas.points_from_xy(range(10), range(10)))
    >>> s
    0    POINT (0 0)
    1    POINT (1 1)
    2    POINT (2 2)
    3    POINT (3 3)
    4    POINT (4 4)
    5    POINT (5 5)
    6    POINT (6 6)
    7    POINT (7 7)
    8    POINT (8 8)
    9    POINT (9 9)
    dtype: geometry

    Querying the tree with a scalar geometry:

    >>> s.sindex.query(box(1, 1, 3, 3))
    array([1, 2, 3])

    >>> s.sindex.query(box(1, 1, 3, 3), predicate="contains")
    array([2])

    Querying the tree with an array of geometries:

    >>> s2 = geopandas.GeoSeries([box(2, 2, 4, 4), box(5, 5, 6, 6)])
    >>> s2
    0    POLYGON ((4 2, 4 4, 2 4, 2 2, 4 2))
    1    POLYGON ((6 5, 6 6, 5 6, 5 5, 6 5))
    dtype: geometry

    >>> s.sindex.query(s2)
    array([[0, 0, 0, 1, 1],
           [2, 3, 4, 5, 6]])

    >>> s.sindex.query(s2, predicate="contains")
    array([[0],
           [3]])

    >>> s.sindex.query(box(1, 1, 3, 3), predicate="dwithin", distance=0)
    array([1, 2, 3])

    >>> s.sindex.query(box(1, 1, 3, 3), predicate="dwithin", distance=2)
    array([0, 1, 2, 3, 4])

    Notes
    -----
    In the context of a spatial join, input geometries are the "left"
    geometries that determine the order of the results, and tree geometries
    are "right" geometries that are joined against the left geometries. This
    effectively performs an inner join, where only those combinations of
    geometries that can be joined based on overlapping bounding boxes or
    optional predicate are returned.
    """
    if predicate not in self.valid_query_predicates:
        if predicate == "dwithin":
            raise ValueError("predicate = 'dwithin' requires GEOS >= 3.10.0")

        raise ValueError(
            "Got predicate='{}'; ".format(predicate)
            + "`predicate` must be one of {}".format(self.valid_query_predicates)
        )

    # distance argument requirement of predicate `dwithin`
    # and only valid for predicate `dwithin`
    kwargs = {}
    if predicate == "dwithin":
        if distance is None:
            # the distance parameter is needed
            raise ValueError(
                "'distance' parameter is required for 'dwithin' predicate"
            )
        # add distance to kwargs
        kwargs["distance"] = distance

    elif distance is not None:
        # distance parameter is invalid
        raise ValueError(
            "'distance' parameter is only supported in combination with "
            "'dwithin' predicate"
        )

    # Reject an unknown format before doing any query work.
    if output_format not in ("tuple", "sparse", "dense"):
        raise ValueError("Invalid output_format: {}".format(output_format))

    geometry = self._as_geometry_array(geometry)

    indices = self._tree.query(geometry, predicate=predicate, **kwargs)

    # The matrix formats are always built from sorted indices.
    if output_format != "tuple":
        sort = True

    if sort:
        if indices.ndim == 1:
            indices = np.sort(indices)
        else:
            # sort by first array (geometry) and then second (tree)
            geo_idx, tree_idx = indices
            sort_indexer = np.lexsort((tree_idx, geo_idx))
            indices = np.vstack((geo_idx[sort_indexer], tree_idx[sort_indexer]))

    if output_format == "tuple":
        return indices

    if indices.ndim == 1:
        # A scalar query has no input axis to build a matrix from.
        raise TypeError(
            "output_format='{}' requires an array-like of geometries".format(
                output_format
            )
        )

    # Row 0 holds input geometry positions, row 1 tree geometry positions.
    # The matrices are (n_tree, n_input), so tree positions index the rows.
    geo_idx, tree_idx = indices
    shape = (len(self.geometries), len(geometry))

    if output_format == "sparse":
        from scipy.sparse import coo_array

        return coo_array(
            (np.ones(len(tree_idx), dtype=np.bool_), (tree_idx, geo_idx)),
            shape=shape,
            dtype=np.bool_,
        )

    # output_format == "dense": index both axes explicitly; indexing with the
    # stacked (2, n) array would select whole rows instead of single cells.
    dense = np.zeros(shape, dtype=bool)
    dense[tree_idx, geo_idx] = True
    return dense
@staticmethod
def _as_geometry_array(geometry):
    """Normalize ``geometry`` into the form handed to the underlying tree.

    Parameters
    ----------
    geometry
        A numpy array of Shapely geometries, a GeoSeries, a GeometryArray,
        a single Shapely geometry, ``None``, or any other array-like
        (for example a list) of Shapely geometries.

    Returns
    -------
    np.ndarray, shapely geometry or None
        For array-like input, the geometry data as a numpy array. A single
        Shapely geometry and ``None`` are handed back unchanged.
    """
    # GeoPandas containers and numpy arrays are unwrapped to ``_data``.
    if isinstance(geometry, np.ndarray):
        return array.from_shapely(geometry)._data
    if isinstance(geometry, geoseries.GeoSeries):
        return geometry.values._data
    if isinstance(geometry, array.GeometryArray):
        return geometry._data
    # Scalars (a lone geometry or None) pass through untouched.
    if isinstance(geometry, BaseGeometry) or geometry is None:
        return geometry
    # Anything else (e.g. a list) is coerced with numpy.
    return np.asarray(geometry)
def nearest(
    self,
    geometry,
    return_all=True,
    max_distance=None,
    return_distance=False,
    exclusive=False,
):
    """
    Find, for every input geometry, the nearest geometry stored in the tree.

    When several tree geometries are equally close to an input geometry,
    all of them are reported by default. Pass ``return_all=False`` to get
    only one of them (which one is non-deterministic).

    In spatial-join terms the input geometries are the "left" side and fix
    the order of the results, while the tree geometries are the "right"
    side. Without ``max_distance`` this behaves like a left join, because
    every input geometry has some nearest tree geometry. With
    ``max_distance`` it behaves like an inner join, since inputs without a
    tree geometry in range produce no match.

    For performance reasons, setting ``max_distance`` is highly
    recommended.

    Parameters
    ----------
    geometry : {shapely.geometry, GeoSeries, GeometryArray, numpy.array of Shapely \
geometries}
        A single shapely geometry, one of the GeoPandas geometry iterables
        (GeoSeries, GeometryArray), or a numpy array of Shapely geometries
        to query against the spatial index.
    return_all : bool, default True
        If there are multiple equidistant or intersecting nearest
        geometries, return all of them instead of a single one.
    max_distance : float, optional
        Maximum distance within which to look for nearest items in the
        tree. Must be greater than 0. By default None, meaning no limit.
    return_distance : bool, optional
        If True, distances are returned in addition to the indices.
        By default False.
    exclusive : bool, optional
        If True, nearest geometries that are equal to the input geometry
        are not returned. By default False. Requires Shapely >= 2.0.

    Returns
    -------
    Indices or tuple of (indices, distances)
        Indices is an ndarray of shape (2,n) and distances (if present) an
        ndarray of shape (n).
        The first subarray of indices contains input geometry indices.
        The second subarray of indices contains tree geometry indices.

    Examples
    --------
    >>> from shapely.geometry import Point, box
    >>> s = geopandas.GeoSeries(geopandas.points_from_xy(range(10), range(10)))
    >>> s.head()
    0    POINT (0 0)
    1    POINT (1 1)
    2    POINT (2 2)
    3    POINT (3 3)
    4    POINT (4 4)
    dtype: geometry

    >>> s.sindex.nearest(Point(1, 1))
    array([[0],
           [1]])

    >>> s.sindex.nearest([box(4.9, 4.9, 5.1, 5.1)])
    array([[0],
           [5]])

    >>> s2 = geopandas.GeoSeries(geopandas.points_from_xy([7.6, 10], [7.6, 10]))
    >>> s2
    0    POINT (7.6 7.6)
    1    POINT (10 10)
    dtype: geometry

    >>> s.sindex.nearest(s2)
    array([[0, 1],
           [8, 9]])
    """
    queries = self._as_geometry_array(geometry)
    if isinstance(queries, BaseGeometry) or queries is None:
        # a lone geometry (or None) is wrapped so the tree always receives
        # a sequence of query geometries
        queries = [queries]

    found = self._tree.query_nearest(
        queries,
        max_distance=max_distance,
        return_distance=return_distance,
        all_matches=return_all,
        exclusive=exclusive,
    )

    if not return_distance:
        return found

    # with distances requested the tree hands back an (indices, distances) pair
    indices, distances = found
    return indices, distances
def intersection(self, coordinates):
    """Compatibility wrapper for rtree.index.Index.intersection,
    use ``query`` instead.

    Parameters
    ----------
    coordinates : sequence or array
        Iterable of the form (min_x, min_y, max_x, max_y)
        to query a rectangle or (x, y) to query a point.
        Any iterable is accepted, including ones without ``len()`` such
        as generators; it is materialised once before its length is
        inspected.

    Returns
    -------
    The result of querying the tree with the box or point built from
    ``coordinates`` (see ``query``).

    Raises
    ------
    TypeError
        If ``coordinates`` is not iterable or does not contain exactly
        2 or 4 items.

    Examples
    --------
    >>> from shapely.geometry import Point, box
    >>> s = geopandas.GeoSeries(geopandas.points_from_xy(range(10), range(10)))
    >>> s
    0    POINT (0 0)
    1    POINT (1 1)
    2    POINT (2 2)
    3    POINT (3 3)
    4    POINT (4 4)
    5    POINT (5 5)
    6    POINT (6 6)
    7    POINT (7 7)
    8    POINT (8 8)
    9    POINT (9 9)
    dtype: geometry

    >>> s.sindex.intersection(box(1, 1, 3, 3).bounds)
    array([1, 2, 3])

    Alternatively, you can use ``query``:

    >>> s.sindex.query(box(1, 1, 3, 3))
    array([1, 2, 3])

    """
    # TODO: we should deprecate this

    def _invalid_coordinates():
        # built lazily so the (possibly large) input is only formatted on failure
        return TypeError(
            "Invalid coordinates, must be iterable in format "
            "(minx, miny, maxx, maxy) (for bounds) or (x, y) (for points). "
            "Got `coordinates` = {}.".format(coordinates)
        )

    # The old API uses tuples of bounds, but Shapely uses geometries.
    # Materialise the input once: this both mimics the iterability check
    # that rtree does and makes ``len`` safe for iterables that do not
    # support it (e.g. generators).
    try:
        coords = tuple(coordinates)
    except TypeError:
        raise _invalid_coordinates() from None

    # convert the bounds / point tuple to a geometry object
    if len(coords) == 4:
        return self._tree.query(shapely.box(*coords))
    if len(coords) == 2:
        return self._tree.query(shapely.points(*coords))
    raise _invalid_coordinates()
@property
def size(self):
    """Number of leaves (input geometries) stored in the spatial index.

    Examples
    --------
    >>> from shapely.geometry import Point
    >>> s = geopandas.GeoSeries(geopandas.points_from_xy(range(10), range(10)))
    >>> s
    0    POINT (0 0)
    1    POINT (1 1)
    2    POINT (2 2)
    3    POINT (3 3)
    4    POINT (4 4)
    5    POINT (5 5)
    6    POINT (6 6)
    7    POINT (7 7)
    8    POINT (8 8)
    9    POINT (9 9)
    dtype: geometry

    >>> s.sindex.size
    10
    """
    leaf_count = len(self._tree)
    return leaf_count
@property
def is_empty(self):
    """Whether the spatial index holds no geometries at all.

    Examples
    --------
    >>> from shapely.geometry import Point
    >>> s = geopandas.GeoSeries(geopandas.points_from_xy(range(10), range(10)))
    >>> s
    0    POINT (0 0)
    1    POINT (1 1)
    2    POINT (2 2)
    3    POINT (3 3)
    4    POINT (4 4)
    5    POINT (5 5)
    6    POINT (6 6)
    7    POINT (7 7)
    8    POINT (8 8)
    9    POINT (9 9)
    dtype: geometry

    >>> s.sindex.is_empty
    False

    >>> s2 = geopandas.GeoSeries()
    >>> s2.sindex.is_empty
    True
    """
    # a tree of length zero is an empty index
    return not len(self._tree)
def __len__(self):
    """Return the length of the underlying tree."""
    tree = self._tree
    return len(tree)
@@ -0,0 +1,358 @@
"""
Testing functionality for geopandas objects.
"""
import warnings
import pandas as pd
from geopandas import GeoDataFrame, GeoSeries
from geopandas.array import GeometryDtype
def _isna(this):
    """Missing-value check that works for scalars as well as (Geo)Series.

    Prefers the object's own ``isna`` method, then ``isnull``, and falls
    back to ``pd.isnull`` for anything else.
    """
    with warnings.catch_warnings():
        # GeoSeries.isna will raise a warning about no longer returning True
        # for empty geometries. This helper is always combined with an
        # is_empty check to preserve behaviour, so the warning is silenced
        # here to keep it from bubbling up to the user.
        warnings.filterwarnings(
            "ignore", r"GeoSeries.isna\(\) previously returned", UserWarning
        )
        for method_name in ("isna", "isnull"):
            if hasattr(this, method_name):
                return getattr(this, method_name)()
        return pd.isnull(this)
def _geom_equals_mask(this, that):
    """
    Element-wise test for geometric equality, treating a pair of empty
    geometries or a pair of missing geometries as equal.

    Parameters
    ----------
    this, that : arrays of Geo objects (or anything that has an `is_empty`
                 attribute)

    Returns
    -------
    Series
        boolean Series, True if geometries in left equal geometries in right
    """
    same_geometry = this.geom_equals(that)
    both_empty = this.is_empty & that.is_empty
    both_missing = _isna(this) & _isna(that)
    return same_geometry | both_empty | both_missing
def geom_equals(this, that):
    """
    Check that every pair of geometries is geometrically equal, treating
    empty or missing pairs as equal.

    Parameters
    ----------
    this, that : arrays of Geo objects (or anything that has an `is_empty`
                 attribute)

    Returns
    -------
    bool
        True if all geometries in left equal geometries in right
    """
    mask = _geom_equals_mask(this, that)
    return mask.all()
def _geom_almost_equals_mask(this, that):
    """
    Element-wise test for 'almost' geometric equality, treating a pair of
    empty geometries or a pair of missing geometries as equal.

    Small coordinate differences are tolerated, but the coordinates must
    appear in the same order for all components of a geometry.

    Parameters
    ----------
    this, that : arrays of Geo objects

    Returns
    -------
    Series
        boolean Series, True if geometries in left almost equal geometries in right
    """
    tolerance = 0.5 * 10 ** (-6)
    nearly_same = this.geom_equals_exact(that, tolerance=tolerance)
    both_empty = this.is_empty & that.is_empty
    both_missing = _isna(this) & _isna(that)
    return nearly_same | both_empty | both_missing
def geom_almost_equals(this, that):
    """
    Check that every pair of geometries is 'almost' equal, treating empty
    or missing pairs as equal.

    Small coordinate differences are tolerated, but the coordinates must
    appear in the same order for all components of a geometry. When both
    arguments are GeoDataFrames, their active geometry columns are compared.

    Parameters
    ----------
    this, that : arrays of Geo objects (or anything that has an `is_empty`
                 property)

    Returns
    -------
    bool
        True if all geometries in left almost equal geometries in right
    """
    both_frames = isinstance(this, GeoDataFrame) and isinstance(that, GeoDataFrame)
    if both_frames:
        this, that = this.geometry, that.geometry

    mask = _geom_almost_equals_mask(this, that)
    return mask.all()
def assert_geoseries_equal(
    left,
    right,
    check_dtype=True,
    check_index_type=False,
    check_series_type=True,
    check_less_precise=False,
    check_geom_type=False,
    check_crs=True,
    normalize=False,
):
    """
    Test util for checking that two GeoSeries are equal.

    Parameters
    ----------
    left, right : two GeoSeries
    check_dtype : bool, default True
        If True, check geo dtype [only included so it's a drop-in replacement
        for assert_series_equal].
    check_index_type : bool, default False
        Check that index types are equal.
    check_series_type : bool, default True
        Check that both are same type (*and* are GeoSeries). If False,
        will attempt to convert both into GeoSeries.
    check_less_precise : bool, default False
        If True, use geom_equals_exact with a tolerance of 0.5e-6.
        If False, use geom_equals.
    check_geom_type : bool, default False
        If True, check that all the geom types are equal.
    check_crs: bool, default True
        If `check_series_type` is True, then also check that the
        crs matches.
    normalize: bool, default False
        If True, normalize the geometries before comparing equality.
        Typically useful with ``check_less_precise=True``, which uses
        ``geom_equals_exact`` and requires exact coordinate order.

    Raises
    ------
    AssertionError
        If any of the enabled checks fails.
    """
    assert len(left) == len(right), "%d != %d" % (len(left), len(right))

    if check_dtype:
        msg = "dtype should be a GeometryDtype, got {0}"
        assert isinstance(left.dtype, GeometryDtype), msg.format(left.dtype)
        # report the dtype of the series that actually failed the check
        assert isinstance(right.dtype, GeometryDtype), msg.format(right.dtype)

    if check_index_type:
        assert isinstance(left.index, type(right.index))

    if check_series_type:
        assert isinstance(left, GeoSeries)
        assert isinstance(left, type(right))

        if check_crs:
            assert left.crs == right.crs
    else:
        # lenient mode: coerce both sides, aligning ``right`` on left's index
        if not isinstance(left, GeoSeries):
            left = GeoSeries(left)
        if not isinstance(right, GeoSeries):
            right = GeoSeries(right, index=left.index)

    assert left.index.equals(right.index), "index: %s != %s" % (left.index, right.index)

    if check_geom_type:
        assert (left.geom_type == right.geom_type).all(), "type: %s != %s" % (
            left.geom_type,
            right.geom_type,
        )

    if normalize:
        left = GeoSeries(left.array.normalize())
        right = GeoSeries(right.array.normalize())

    if not check_crs:
        # the CRS was deliberately not compared, so silence the mismatch
        # warning that the element-wise comparison would otherwise emit
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", "CRS mismatch", UserWarning)
            _check_equality(left, right, check_less_precise)
    else:
        _check_equality(left, right, check_less_precise)
def _truncated_string(geom):
    """Return ``str(geom)``, cut to 100 characters plus "..." when longer."""
    text = str(geom)
    return text if len(text) <= 100 else text[:100] + "..."
def _check_equality(left, right, check_less_precise):
    """Raise an AssertionError describing the mismatches between two series.

    ``check_less_precise`` selects the 'almost equal' comparison instead of
    the exact one. Nothing happens when every pair compares equal.
    """
    qualifier = "almost " if check_less_precise else ""
    compare = _geom_almost_equals_mask if check_less_precise else _geom_equals_mask
    mask = compare(left, right)

    if mask.all():
        return

    mismatched_left = left[~mask]
    mismatched_right = right[~mask]
    template = (
        "{0} out of {1} geometries are not {3}equal.\n"
        "Indices where geometries are not {3}equal: {2} \n"
        "The first not {3}equal geometry:\n"
        "Left: {4}\n"
        "Right: {5}\n"
    )
    raise AssertionError(
        template.format(
            len(mismatched_left),
            len(left),
            mismatched_left.index.to_list(),
            qualifier,
            _truncated_string(mismatched_left.iloc[0]),
            _truncated_string(mismatched_right.iloc[0]),
        )
    )
def assert_geodataframe_equal(
    left,
    right,
    check_dtype=True,
    check_index_type="equiv",
    check_column_type="equiv",
    check_frame_type=True,
    check_like=False,
    check_less_precise=False,
    check_geom_type=False,
    check_crs=True,
    normalize=False,
):
    """
    Check that two GeoDataFrames are equal.

    Parameters
    ----------
    left, right : two GeoDataFrames
    check_dtype : bool, default True
        Whether to check the DataFrame dtype is identical.
    check_index_type, check_column_type : bool, default 'equiv'
        Check that index types are equal. Both are forwarded to pandas'
        ``assert_frame_equal``; ``check_column_type`` is also used as the
        ``exact`` argument when comparing the column index.
    check_frame_type : bool, default True
        Check that both are same type (*and* are GeoDataFrames). If False,
        will attempt to convert both into GeoDataFrame.
    check_like : bool, default False
        If true, ignore the order of rows & columns
    check_less_precise : bool, default False
        If True, use geom_equals_exact. if False, use geom_equals.
    check_geom_type : bool, default False
        If True, check that all the geom types are equal.
    check_crs: bool, default True
        If `check_frame_type` is True, then also check that the
        crs matches.
    normalize: bool, default False
        If True, normalize the geometries before comparing equality.
        Typically useful with ``check_less_precise=True``, which uses
        ``geom_equals_exact`` and requires exact coordinate order.
    """
    try:
        # added from pandas 0.20
        from pandas.testing import assert_frame_equal, assert_index_equal
    except ImportError:
        from pandas.util.testing import assert_frame_equal, assert_index_equal

    # instance validation
    if not check_frame_type:
        if not isinstance(left, GeoDataFrame):
            left = GeoDataFrame(left)
        if not isinstance(right, GeoDataFrame):
            right = GeoDataFrame(right)
    else:
        assert isinstance(left, GeoDataFrame)
        assert isinstance(left, type(right))

        if check_crs:
            # The CRS comparison is skipped when neither frame has an
            # active geometry column (name unset, or name not among the
            # columns), or when neither has a CRS (None or {}).
            crs_check_not_applicable = (
                (
                    left._geometry_column_name is None
                    and right._geometry_column_name is None
                )
                or (
                    left._geometry_column_name not in left.columns
                    and right._geometry_column_name not in right.columns
                )
                or not (left.crs or right.crs)
            )
            if not crs_check_not_applicable:
                assert left.crs == right.crs

    # shape comparison
    assert left.shape == right.shape, (
        "GeoDataFrame shape mismatch, left: {lshape!r}, right: {rshape!r}.\n"
        "Left columns: {lcols!r}, right columns: {rcols!r}"
    ).format(
        lshape=left.shape, rshape=right.shape, lcols=left.columns, rcols=right.columns
    )

    if check_like:
        left = left.reindex_like(right)

    # column comparison
    assert_index_equal(
        left.columns, right.columns, exact=check_column_type, obj="GeoDataFrame.columns"
    )

    # geometry comparison, one geometry-typed column at a time
    geometry_columns = [
        name
        for name, column_dtype in left.dtypes.items()
        if isinstance(column_dtype, GeometryDtype)
    ]
    for name in geometry_columns:
        assert_geoseries_equal(
            left[name],
            right[name],
            normalize=normalize,
            check_dtype=check_dtype,
            check_less_precise=check_less_precise,
            check_geom_type=check_geom_type,
            check_crs=check_crs,
        )

    # ensure the active geometry column is the same
    assert left._geometry_column_name == right._geometry_column_name

    # drop geometries and check remaining columns
    left_attributes = left.select_dtypes(exclude="geometry")
    right_attributes = right.select_dtypes(exclude="geometry")
    assert_frame_equal(
        left_attributes,
        right_attributes,
        check_dtype=check_dtype,
        check_index_type=check_index_type,
        check_column_type=check_column_type,
        obj="GeoDataFrame",
    )
@@ -0,0 +1,9 @@
{
"type": "FeatureCollection",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "Name": "Null Geometry" }, "geometry": null },
{ "type": "Feature", "properties": { "Name": "SF to NY" }, "geometry": { "type": "LineString", "coordinates": [ [ -122.4051293283311, 37.786780113640894 ], [ -73.859832357849271, 40.487594916296196 ] ] } }
]
}
@@ -0,0 +1,38 @@
import subprocess
import sys
def test_no_additional_imports():
    """Check that ``import geopandas`` pulls in no optional/dev dependency.

    A fresh interpreter imports geopandas and intersects the top-level
    names found in ``sys.modules`` with the blacklist below. The child
    exits with the number of offending modules, so any non-zero return
    code fails the test; the child's stderr is attached to the assertion.
    """
    # Names are compared verbatim against the top-level entries of
    # ``sys.modules``, so they must use the exact module spelling.
    blacklist = {
        "pytest",
        "py",
        # IPython's importable package is capitalised; a lowercase
        # "ipython" entry could never match anything in sys.modules
        "IPython",
        # fiona actually gets imported if installed (but error suppressed until used)
        # "fiona",
        "mapclassify",
        "sqlalchemy",
        "psycopg",
        "psycopg2",
        "geopy",
        "geoalchemy2",
        "matplotlib",
    }

    code = """
import sys
import geopandas
blacklist = {0!r}

mods = blacklist & set(m.split('.')[0] for m in sys.modules)
if mods:
    sys.stderr.write('err: geopandas should not import: {{}}'.format(', '.join(mods)))
    sys.exit(len(mods))
""".format(
        blacklist
    )
    call = [sys.executable, "-c", code]
    # capture stderr so that the list of offending modules (or an import
    # error of geopandas itself) shows up in the failure message
    result = subprocess.run(call, check=False, capture_output=True, text=True)
    assert result.returncode == 0, result.stderr
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,30 @@
from geopandas._compat import import_optional_dependency
import pytest
def test_import_optional_dependency_present():
    """The helper hands back the already-installed module."""
    # pandas is not optional, but we know it is present
    imported = import_optional_dependency("pandas")
    assert imported is not None

    # a plain import must yield the same module
    import pandas as pd

    assert imported == pd
def test_import_optional_dependency_absent():
    """A missing module raises ImportError, optionally with the extra text."""
    cases = [
        ({}, "Missing optional dependency 'foo'"),
        ({"extra": "foo is required"}, "foo is required"),
    ]
    for kwargs, expected_message in cases:
        with pytest.raises(ImportError, match=expected_message):
            import_optional_dependency("foo", **kwargs)
@pytest.mark.parametrize(
    "bad_import", [["foo"], 0, False, True, {}, {"foo"}, {"foo": "bar"}]
)
def test_import_optional_dependency_invalid(bad_import):
    """Non-string module names are rejected with a ValueError."""
    rejects_name = pytest.raises(ValueError, match="Invalid module name")
    with rejects_name:
        import_optional_dependency(bad_import)
@@ -0,0 +1,47 @@
import geopandas
import pytest
def test_options():
    """The options object lists its known options and rejects unknown ones."""
    opts = geopandas.options
    assert "display_precision: " in repr(opts)

    known_options = {"display_precision", "use_pygeos", "io_engine"}
    assert set(dir(opts)) == known_options

    # reading an unknown option fails ...
    with pytest.raises(AttributeError):
        opts.non_existing_option

    # ... and so does assigning to it
    with pytest.raises(AttributeError):
        opts.non_existing_option = 10
def test_options_display_precision():
    """``display_precision`` defaults to None and validates assigned values."""
    assert geopandas.options.display_precision is None
    try:
        geopandas.options.display_precision = 5
        assert geopandas.options.display_precision == 5

        with pytest.raises(ValueError):
            geopandas.options.display_precision = "abc"

        with pytest.raises(ValueError):
            geopandas.options.display_precision = -1
    finally:
        # the option is process-global: restore the default even when an
        # assertion above fails, so other tests do not see a leaked value
        geopandas.options.display_precision = None
def test_options_io_engine():
    """``io_engine`` defaults to None and validates assigned values."""
    assert geopandas.options.io_engine is None
    try:
        geopandas.options.io_engine = "pyogrio"
        assert geopandas.options.io_engine == "pyogrio"

        with pytest.raises(ValueError):
            geopandas.options.io_engine = "abc"

        with pytest.raises(ValueError):
            geopandas.options.io_engine = -1
    finally:
        # the option is process-global: restore the default even when an
        # assertion above fails, so other tests do not see a leaked value
        geopandas.options.io_engine = None
@@ -0,0 +1,747 @@
import random
import warnings
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point, Polygon
from geopandas import GeoDataFrame, GeoSeries, points_from_xy, read_file
from geopandas.array import GeometryArray, from_shapely, from_wkb, from_wkt
import pytest
from geopandas.testing import assert_geodataframe_equal
pyproj = pytest.importorskip("pyproj")
def _create_df(x, y=None, crs=None):
    """Build a point GeoDataFrame with two derived value columns.

    Parameters
    ----------
    x : array-like
        x coordinates of the points.
    y : array-like, optional
        y coordinates of the points. When omitted (``None``), ``x`` is
        reused so the points lie on the diagonal.
    crs : optional
        Passed through to ``GeoDataFrame`` as the CRS of the frame.

    Returns
    -------
    GeoDataFrame
        Columns ``geometry`` (points from ``x``/``y``), ``value1``
        (``x + y``) and ``value2`` (``x * y``).
    """
    # Explicit None check: ``y or x`` relies on truthiness, which raises
    # for multi-element numpy arrays and silently swaps in ``x`` for an
    # empty ``y``.
    if y is None:
        y = x
    x = np.asarray(x)
    y = np.asarray(y)

    return GeoDataFrame(
        {"geometry": points_from_xy(x, y), "value1": x + y, "value2": x * y}, crs=crs
    )
def df_epsg26918():
    """Return a ten-point GeoDataFrame with CRS ``epsg:26918``."""
    # EPSG:26918
    # Center coordinates
    # -1683723.64 6689139.23
    x_start, y_start = -1683723, 6689139
    return _create_df(
        x=range(x_start, x_start + 10, 1),
        y=range(y_start, y_start + 10, 1),
        crs="epsg:26918",
    )
def test_to_crs_transform():
    """Round trip 26918 -> 4326 -> 26918 gives back (almost) the same frame."""
    original = df_epsg26918()
    as_lonlat = original.to_crs(epsg=4326)
    round_tripped = as_lonlat.to_crs(epsg=26918)
    assert_geodataframe_equal(original, round_tripped, check_less_precise=True)
def test_to_crs_transform__missing_data():
    """Same round trip as above, with one geometry set to None."""
    # https://github.com/geopandas/geopandas/issues/1573
    original = df_epsg26918()
    original.loc[3, "geometry"] = None
    as_lonlat = original.to_crs(epsg=4326)
    round_tripped = as_lonlat.to_crs(epsg=26918)
    assert_geodataframe_equal(original, round_tripped, check_less_precise=True)
def test_to_crs_transform__empty_data():
    """Same round trip on a frame sliced down to zero rows."""
    no_rows = df_epsg26918().iloc[:0]
    as_lonlat = no_rows.to_crs(epsg=4326)
    round_tripped = as_lonlat.to_crs(epsg=26918)
    assert_geodataframe_equal(no_rows, round_tripped, check_less_precise=True)
def test_to_crs_inplace():
    """``inplace=True`` yields the same result as the returned copy."""
    frame = df_epsg26918()
    expected = frame.to_crs(epsg=4326)
    frame.to_crs(epsg=4326, inplace=True)
    assert_geodataframe_equal(frame, expected, check_less_precise=True)
def test_to_crs_geo_column_name():
    """to_crs() keeps a non-default geometry column name (GH#339)."""
    frame = df_epsg26918()
    frame = frame.rename(columns={"geometry": "geom"})
    frame.set_geometry("geom", inplace=True)

    as_lonlat = frame.to_crs(epsg=4326)
    round_tripped = as_lonlat.to_crs(epsg=26918)

    assert as_lonlat.geometry.name == "geom"
    assert round_tripped.geometry.name == "geom"
    assert_geodataframe_equal(frame, round_tripped, check_less_precise=True)
def test_to_crs_dimension_z():
    """Reprojecting 3D points preserves the z dimension."""
    points_3d = points_from_xy([1, 2], [2, 3], [3, 4], crs=4326)
    assert points_3d.has_z.all()

    reprojected = points_3d.to_crs(epsg=3857)
    assert reprojected.has_z.all()
# pyproj + numpy 1.25 trigger warning for single-element array -> recommendation is to
# ignore the warning for now (https://github.com/pyproj4/pyproj/issues/1307)
@pytest.mark.filterwarnings("ignore:Conversion of an array with:DeprecationWarning")
def test_to_crs_dimension_mixed():
    """A series mixing 2D and 3D geometries keeps each geometry's dimension."""
    mixed = GeoSeries([Point(1, 2), LineString([(1, 2, 3), (4, 5, 6)])], crs=2056)
    reprojected = mixed.to_crs(epsg=4326)
    assert not reprojected[0].is_empty
    assert reprojected.has_z.tolist() == [False, True]

    back_again = reprojected.to_crs(epsg=2056)
    # TODO replace with assert_geoseries_equal once we expose tolerance keyword
    # assert_geoseries_equal(roundtrip, s, check_less_precise=True)
    for got, expected in zip(back_again, mixed):
        np.testing.assert_allclose(got.coords[:], expected.coords[:], atol=0.01)
# -----------------------------------------------------------------------------
# Test different supported formats for CRS specification
@pytest.fixture(
    params=[
        4326,
        "epsg:4326",
        pytest.param(
            {"init": "epsg:4326"},
        ),
        "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs",
        {"proj": "latlong", "ellps": "WGS84", "datum": "WGS84", "no_defs": True},
    ],
    ids=["epsg_number", "epsg_string", "epsg_dict", "proj4_string", "proj4_dict"],
)
def epsg4326(request):
    """Keyword arguments for ``to_crs`` describing EPSG:4326 in several formats.

    Integer codes are passed as ``epsg=``, everything else as ``crs=``.
    """
    keyword = "epsg" if isinstance(request.param, int) else "crs"
    return {keyword: request.param}
@pytest.fixture(
    params=[
        26918,
        "epsg:26918",
        pytest.param(
            {"init": "epsg:26918", "no_defs": True},
        ),
        "+proj=utm +zone=18 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ",
        {"proj": "utm", "zone": 18, "datum": "NAD83", "units": "m", "no_defs": True},
    ],
    ids=["epsg_number", "epsg_string", "epsg_dict", "proj4_string", "proj4_dict"],
)
def epsg26918(request):
    """Keyword arguments for ``to_crs`` describing EPSG:26918 in several formats.

    Integer codes are passed as ``epsg=``, everything else as ``crs=``.
    """
    keyword = "epsg" if isinstance(request.param, int) else "crs"
    return {keyword: request.param}
@pytest.mark.filterwarnings("ignore:'\\+init:DeprecationWarning")
@pytest.mark.filterwarnings("ignore:'\\+init:FutureWarning")
def test_transform2(epsg4326, epsg26918):
    """Round trip through every combination of CRS specification formats."""
    # with PROJ >= 7, the transformation using EPSG code vs proj4 string is
    # slightly different due to use of grid files or not -> turn off network
    # to not use grid files at all for this test
    pyproj.network.set_network_enabled(False)

    original = df_epsg26918()
    as_lonlat = original.to_crs(**epsg4326)
    round_tripped = as_lonlat.to_crs(**epsg26918)

    # can't check for CRS equality, as the formats differ although representing
    # the same CRS
    assert_geodataframe_equal(
        original, round_tripped, check_less_precise=True, check_crs=False
    )
# pyproj + numpy 1.25 trigger warning for single-element array -> recommendation is to
# ignore the warning for now (https://github.com/pyproj4/pyproj/issues/1307)
@pytest.mark.filterwarnings("ignore:Conversion of an array with:DeprecationWarning")
def test_crs_axis_order__always_xy():
    """Reprojected coordinates come back in x/y (lon/lat) order."""
    source = GeoDataFrame(geometry=[Point(-1683723, 6689139)], crs="epsg:26918")
    expected = GeoDataFrame(
        geometry=[Point(-110.1399901, 55.1350011)], crs="epsg:4326"
    )
    reprojected = source.to_crs("epsg:4326")
    assert_geodataframe_equal(reprojected, expected, check_less_precise=True)
def test_skip_exact_same():
    """Reprojecting to the frame's own CRS leaves the data unchanged."""
    original = df_epsg26918()
    same_crs = original.to_crs(original.crs)
    assert_geodataframe_equal(original, same_crs, check_less_precise=True)
# Test CRS on GeometryArray level
class TestGeometryArrayCRS:
def setup_method(self):
    """Create the CRS objects and geometries shared by the tests."""
    self.osgb = pyproj.CRS(27700)
    self.wgs = pyproj.CRS(4326)
    self.geoms = [Point(0, 0), Point(1, 1)]

    # ten triangles with random vertices
    triangles = []
    for _ in range(10):
        vertices = [(random.random(), random.random()) for _ in range(3)]
        triangles.append(Polygon(vertices))
    self.polys = triangles
    self.arr = from_shapely(self.polys, crs=27700)
def test_array(self):
    """CRS handling on the GeometryArray level."""
    # assigned after construction
    assigned = from_shapely(self.geoms)
    assigned.crs = 27700
    assert assigned.crs == self.osgb

    # passed at construction
    constructed = from_shapely(self.geoms, crs=27700)
    assert constructed.crs == self.osgb

    # wrapping an array keeps its CRS
    wrapped = GeometryArray(constructed)
    assert wrapped.crs == self.osgb

    # an explicit crs on the wrapper takes effect
    overridden = GeometryArray(wrapped, crs=4326)
    assert overridden.crs == self.wgs
def test_series(self):
    """CRS set on a GeoSeries is visible on the series and on its values."""

    def expect_crs(series, crs):
        assert series.crs == crs
        assert series.values.crs == crs

    expect_crs(GeoSeries(crs=27700), self.osgb)

    plain_array = from_shapely(self.geoms)
    series = GeoSeries(plain_array, crs=27700)
    expect_crs(series, self.osgb)

    # manually change CRS
    series = series.set_crs(4326, allow_override=True)
    expect_crs(series, self.wgs)

    series = GeoSeries(self.geoms, crs=27700)
    expect_crs(series, self.osgb)

    array_with_crs = from_shapely(self.geoms, crs=27700)
    series = GeoSeries(array_with_crs)
    expect_crs(series, self.osgb)

    with pytest.raises(
        ValueError,
        match="CRS mismatch between CRS of the passed geometries and 'crs'",
    ):
        series = GeoSeries(array_with_crs, crs=4326)
    # the failed construction never rebinds ``series``
    assert series.crs == self.osgb
def test_dataframe(self):
    """CRS propagation through the different ways of building a GeoDataFrame."""

    def expect_crs(frame, crs):
        # the CRS must agree on the frame, its geometry and the raw array
        assert frame.crs == crs
        assert frame.geometry.crs == crs
        assert frame.geometry.values.crs == crs

    geom_array = from_shapely(self.geoms, crs=27700)
    df = GeoDataFrame(geometry=geom_array)
    expect_crs(df, self.osgb)

    geom_array = from_shapely(self.geoms)
    series = GeoSeries(geom_array, crs=27700)
    df = GeoDataFrame(geometry=series)
    expect_crs(df, self.osgb)

    # different passed CRS than array CRS is now an error
    mismatch = "CRS mismatch between CRS of the passed geometries and 'crs'"
    with pytest.raises(ValueError, match=mismatch):
        df = GeoDataFrame(geometry=series, crs=4326)
    with pytest.raises(ValueError, match=mismatch):
        GeoDataFrame(geometry=series, crs=4326)
    with pytest.raises(ValueError, match=mismatch):
        GeoDataFrame({"data": [1, 2], "geometry": series}, crs=4326)
    with pytest.raises(ValueError, match=mismatch):
        GeoDataFrame(df, crs=4326).crs

    # manually change CRS
    geom_array = from_shapely(self.geoms)
    series = GeoSeries(geom_array, crs=27700)
    df = GeoDataFrame(geometry=series)
    df = df.set_crs(crs="epsg:4326", allow_override=True)
    expect_crs(df, self.wgs)

    no_geometry = "Assigning CRS to a GeoDataFrame without"
    with pytest.raises(ValueError, match=no_geometry):
        GeoDataFrame(self.geoms, columns=["geom"], crs=27700)
    with pytest.raises(ValueError, match=no_geometry):
        GeoDataFrame(crs=27700)

    df = GeoDataFrame(self.geoms, columns=["geom"])
    df = df.set_geometry("geom", crs=27700)
    expect_crs(df, self.osgb)
    assert df.geom.crs == self.osgb
    assert df.geom.values.crs == self.osgb

    df = GeoDataFrame(geometry=self.geoms, crs=27700)
    expect_crs(df, self.osgb)

    # new geometry with set CRS has priority over GDF CRS
    df = GeoDataFrame(geometry=self.geoms, crs=27700)
    df = df.set_geometry(self.geoms, crs=4326)
    expect_crs(df, self.wgs)

    geom_array = from_shapely(self.geoms)
    series = GeoSeries(geom_array, crs=27700)
    df = GeoDataFrame()
    df = df.set_geometry(series)
    assert df._geometry_column_name == "geometry"
    expect_crs(df, self.osgb)

    geom_array = from_shapely(self.geoms, crs=27700)
    df = GeoDataFrame()
    df = df.set_geometry(geom_array)
    expect_crs(df, self.osgb)

    geom_array = from_shapely(self.geoms)
    df = GeoDataFrame({"col1": [1, 2], "geometry": geom_array}, crs=4326)
    expect_crs(df, self.wgs)

    geom_array = from_shapely(self.geoms, crs=4326)
    df = GeoDataFrame({"col1": [1, 2], "geometry": geom_array})
    expect_crs(df, self.wgs)

    # geometry column name None on init
    df = GeoDataFrame({"geometry": [0, 1]})
    with pytest.raises(
        ValueError,
        match="Assigning CRS to a GeoDataFrame without a geometry",
    ):
        df.crs = 27700

    # geometry column without geometry
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", "Geometry column does not contain geometry", UserWarning
        )
        df = GeoDataFrame({"geometry": [Point(0, 1)]}).assign(geometry=[0])
    with pytest.raises(
        ValueError,
        match="Assigning CRS to a GeoDataFrame without an active geometry",
    ):
        df.crs = 27700
    with pytest.raises(
        AttributeError,
        match="The CRS attribute of a GeoDataFrame without an active",
    ):
        assert df.crs == self.osgb
def test_dataframe_getitem_without_geometry_column(self):
    """Selecting columns without the active geometry makes ``crs`` raise."""
    frame = GeoDataFrame({"col": range(10)}, geometry=self.arr)
    frame["geom2"] = frame.geometry.centroid
    without_active_geometry = frame[["col", "geom2"]]

    expected_error = pytest.raises(
        AttributeError,
        match="The CRS attribute of a GeoDataFrame without an active",
    )
    with expected_error:
        assert without_active_geometry.crs == self.osgb
def test_dataframe_setitem(self):
    """CRS handling when geometry columns are assigned with ``df[...] = ...``."""

    def expect_crs(frame, crs):
        assert frame.crs == crs
        assert frame.geometry.crs == crs
        assert frame.geometry.values.crs == crs

    adding_geometry = "You are adding a column named 'geometry'"

    # new geometry CRS has priority over GDF CRS
    geom_array = from_shapely(self.geoms)
    series = GeoSeries(geom_array, crs=27700)
    df = GeoDataFrame()
    with pytest.warns(FutureWarning, match=adding_geometry):
        df["geometry"] = series
    expect_crs(df, self.osgb)

    geom_array = from_shapely(self.geoms, crs=27700)
    df = GeoDataFrame()
    with pytest.warns(FutureWarning, match=adding_geometry):
        df["geometry"] = geom_array
    expect_crs(df, self.osgb)

    # test to_crs case (GH1960)
    geom_array = from_shapely(self.geoms)
    df = GeoDataFrame({"col1": [1, 2], "geometry": geom_array}, crs=4326)
    df["geometry"] = df["geometry"].to_crs(27700)
    expect_crs(df, self.osgb)

    # test changing geometry crs not in the geometry column doesn't change the crs
    geom_array = from_shapely(self.geoms)
    df = GeoDataFrame(
        {"col1": [1, 2], "geometry": geom_array, "other_geom": geom_array}, crs=4326
    )
    df["other_geom"] = from_shapely(self.geoms, crs=27700)
    assert df.crs == self.wgs
    assert df.geometry.crs == self.wgs
    assert df["geometry"].crs == self.wgs
    assert df["other_geom"].crs == self.osgb
def test_dataframe_setitem_without_geometry_column(self):
    """Re-adding geometries after overwriting the column yields no CRS."""
    geom_array = from_shapely(self.geoms)
    frame = GeoDataFrame({"col1": [1, 2], "geometry": geom_array}, crs=4326)

    # override geometry with non geometry
    with pytest.warns(UserWarning):
        frame["geometry"] = 1

    # assigning a list of geometry object doesn't have cached access to 4326
    frame["geometry"] = self.geoms
    assert frame.crs is None
@pytest.mark.parametrize(
    "scalar", [None, Point(0, 0), LineString([(0, 0), (1, 1)])]
)
def test_scalar(self, scalar):
    """A geometry column assigned from a scalar accepts a CRS afterwards."""
    frame = GeoDataFrame()
    with pytest.warns(
        FutureWarning, match="You are adding a column named 'geometry'"
    ):
        frame["geometry"] = scalar

    frame = frame.set_crs(4326)
    for observed in (frame.crs, frame.geometry.crs, frame.geometry.values.crs):
        assert observed == self.wgs
@pytest.mark.filterwarnings("ignore:Accessing CRS")
def test_crs_with_no_geom_fails(self):
    """Assigning a CRS to a frame with no geometry raises ValueError."""
    expected_error = pytest.raises(
        ValueError, match="Assigning CRS to a GeoDataFrame without"
    )
    with expected_error:
        empty_frame = GeoDataFrame()
        empty_frame.crs = 4326
def test_read_file(self, nybb_filename):
    """A frame read from ``nybb_filename`` exposes ``pyproj.CRS(2263)`` at every level."""
    frame = read_file(nybb_filename)
    expected_crs = pyproj.CRS(2263)
    assert frame.crs == expected_crs
    assert frame.geometry.crs == expected_crs
    assert frame.geometry.values.crs == expected_crs
def test_multiple_geoms(self):
    """The active geometry and an extra geometry column each keep their own CRS."""
    active_values = from_shapely(self.geoms, crs=27700)
    extra_series = GeoSeries(self.geoms, crs=4326)
    frame = GeoDataFrame(extra_series, geometry=active_values, columns=["col1"])

    # active geometry column
    assert frame.crs == self.osgb
    assert frame.geometry.crs == self.osgb
    assert frame.geometry.values.crs == self.osgb

    # secondary geometry column
    assert frame.col1.crs == self.wgs
    assert frame.col1.values.crs == self.wgs
def test_multiple_geoms_set_geom(self):
    """After ``set_geometry`` the frame CRS follows the newly active column."""
    active_values = from_shapely(self.geoms, crs=27700)
    extra_series = GeoSeries(self.geoms, crs=4326)
    frame = GeoDataFrame(
        extra_series, geometry=active_values, columns=["col1"]
    ).set_geometry("col1")

    # "col1" is the active geometry now
    assert frame.crs == self.wgs
    assert frame.geometry.crs == self.wgs
    assert frame.geometry.values.crs == self.wgs

    # the previously active column is unchanged
    assert frame["geometry"].crs == self.osgb
    assert frame["geometry"].values.crs == self.osgb
def test_assign_cols(self):
    """Columns assigned after construction keep the CRS (or lack of it) of their values."""
    active_values = from_shapely(self.geoms, crs=27700)
    series = GeoSeries(self.geoms, crs=4326)
    frame = GeoDataFrame(series, geometry=active_values, columns=["col1"])

    frame["geom2"] = series
    frame["geom3"] = series.values
    frame["geom4"] = from_shapely(self.geoms)  # built without a CRS

    assert frame.crs == self.osgb
    assert frame.geometry.crs == self.osgb
    assert frame.geometry.values.crs == self.osgb

    assert frame.geom2.crs == self.wgs
    assert frame.geom2.values.crs == self.wgs
    assert frame.geom3.crs == self.wgs
    assert frame.geom3.values.crs == self.wgs

    assert frame.geom4.crs is None
    assert frame.geom4.values.crs is None
def test_copy(self):
    """``copy()`` of an array, a series and a frame reports the same CRS as the source."""
    values = from_shapely(self.geoms, crs=27700)
    series = GeoSeries(self.geoms, crs=4326)
    frame = GeoDataFrame(series, geometry=values, columns=["col1"])

    # array
    assert values.copy().crs == values.crs

    # series
    series_dup = series.copy()
    assert series_dup.crs == series.crs
    assert series_dup.values.crs == series.values.crs

    # frame: active geometry and the additional geometry column
    frame_dup = frame.copy()
    assert frame_dup.crs == frame.crs
    assert frame_dup.geometry.crs == frame.geometry.crs
    assert frame_dup.geometry.values.crs == frame.geometry.values.crs
    assert frame_dup.col1.crs == frame.col1.crs
    assert frame_dup.col1.values.crs == frame.col1.values.crs
def test_rename(self):
    """Renaming geometry or other columns leaves every CRS as it was."""
    values = from_shapely(self.geoms, crs=27700)
    series = GeoSeries(self.geoms, crs=4326)
    frame = GeoDataFrame(series, geometry=values, columns=["col1"])

    # plain column rename followed by re-activating the renamed column
    frame = frame.rename(columns={"geometry": "geom"}).set_geometry("geom")
    assert frame.crs == self.osgb
    assert frame.geometry.crs == self.osgb
    assert frame.geometry.values.crs == self.osgb

    # dedicated geometry rename
    frame = frame.rename_geometry("geom2")
    assert frame.crs == self.osgb
    assert frame.geometry.crs == self.osgb
    assert frame.geometry.values.crs == self.osgb

    # renaming the non-active geometry column
    frame = frame.rename(columns={"col1": "column1"})
    assert frame.column1.crs == self.wgs
    assert frame.column1.values.crs == self.wgs
def test_geoseries_to_crs(self):
    """``to_crs`` updates the CRS of a series / active geometry, and nothing else."""
    series = GeoSeries(self.geoms, crs=27700).to_crs(4326)
    assert series.crs == self.wgs
    assert series.values.crs == self.wgs

    frame = GeoDataFrame(geometry=series)
    assert frame.crs == self.wgs

    frame = frame.to_crs(27700)
    assert frame.crs == self.osgb
    assert frame.geometry.crs == self.osgb
    assert frame.geometry.values.crs == self.osgb

    # make sure that only active geometry is transformed
    frame["col1"] = from_shapely(self.geoms, crs=4326)
    frame = frame.to_crs(3857)
    assert frame.col1.crs == self.wgs
    assert frame.col1.values.crs == self.wgs
def test_array_to_crs(self):
    """``to_crs`` on a geometry array returns an array carrying the target CRS."""
    reprojected = from_shapely(self.geoms, crs=27700).to_crs(4326)
    assert reprojected.crs == self.wgs
def test_from_shapely(self):
    """``from_shapely`` attaches the CRS given via ``crs=``."""
    values = from_shapely(self.geoms, crs=27700)
    assert values.crs == self.osgb
def test_from_wkb(self):
    """``from_wkb`` attaches the CRS given via ``crs=``."""
    wkb_items = [geom.wkb for geom in self.geoms]
    values = from_wkb(wkb_items, crs=27700)
    assert values.crs == self.osgb
def test_from_wkt(self):
    """``from_wkt`` attaches the CRS given via ``crs=``."""
    wkt_items = [geom.wkt for geom in self.geoms]
    values = from_wkt(wkt_items, crs=27700)
    assert values.crs == self.osgb
def test_points_from_xy(self):
    """``points_from_xy`` attaches the CRS given via ``crs=``."""
    coords = pd.DataFrame([{"x": v, "y": v, "z": v} for v in range(10)])
    points = points_from_xy(coords["x"], coords["y"], crs=27700)
    assert points.crs == self.osgb
def test_original(self):
    """Setting a CRS in the GeoSeries must not set it on the CRS-less array passed in."""
    source_values = from_shapely(self.geoms)
    series = GeoSeries(source_values, crs=27700)
    # the input array is left without a CRS ...
    assert source_values.crs is None
    # ... while the series built from it has one
    assert series.crs == self.osgb
def test_ops(self):
    """Unary geometry attributes/methods of the array keep its CRS."""
    values = self.arr

    # property-style operations, checked in the original order
    for attr_name in ("boundary", "centroid", "convex_hull", "envelope", "exterior"):
        derived = getattr(values, attr_name)
        assert derived.crs == self.osgb

    # method-style operation
    assert values.representative_point().crs == self.osgb
def test_binary_ops(self):
    """Binary set operations between two arrays keep the CRS."""
    left = self.arr

    # Collect ten valid random quadrilaterals; invalid draws are discarded.
    valid_quads = []
    while len(valid_quads) < 10:
        candidate = Polygon([(random.random(), random.random()) for _ in range(4)])
        if candidate.is_valid:
            valid_quads.append(candidate)
    right = from_shapely(valid_quads, crs=27700)

    for op_name in ("difference", "intersection", "symmetric_difference", "union"):
        combined = getattr(left, op_name)(right)
        assert combined.crs == self.osgb
def test_other(self):
    """``buffer``, ``interpolate`` and ``simplify`` results keep the CRS."""
    values = self.arr
    assert values.buffer(5).crs == self.osgb
    assert values.exterior.interpolate(0.1).crs == self.osgb
    assert values.simplify(5).crs == self.osgb
@pytest.mark.parametrize(
    "attr, arg",
    [
        ("affine_transform", ([0, 1, 1, 0, 0, 0],)),
        ("translate", ()),
        ("rotate", (10,)),
        ("scale", ()),
        ("skew", ()),
    ],
)
def test_affinity_methods(self, attr, arg):
    """Each affine method, called with ``arg``, returns a result with the same CRS."""
    method = getattr(self.arr, attr)
    transformed = method(*arg)
    assert transformed.crs == self.osgb
def test_slice(self):
    """Positional slicing of a series or frame keeps the CRS of the geometry arrays."""
    series = GeoSeries(self.arr, crs=27700)
    assert series.iloc[1:].values.crs == self.osgb

    frame = GeoDataFrame({"col1": self.arr}, geometry=series)
    # both the active geometry and the extra geometry column are checked
    assert frame.iloc[1:].geometry.values.crs == self.osgb
    assert frame.iloc[1:].col1.values.crs == self.osgb
def test_concat(self):
    """``pd.concat`` keeps the CRS of every geometry column."""
    series = GeoSeries(self.arr, crs=27700)
    assert pd.concat([series, series]).values.crs == self.osgb

    extra_values = from_shapely(self.geoms, crs=4326)
    frame = GeoDataFrame({"col1": extra_values}, geometry=series)
    assert pd.concat([frame, frame]).geometry.values.crs == self.osgb
    assert pd.concat([frame, frame]).col1.values.crs == self.wgs
def test_merge(self):
    """An index-on-index merge keeps the CRS of all four geometry columns."""
    active_values = from_shapely(self.geoms, crs=27700)
    extra_series = GeoSeries(self.geoms, crs=4326)

    left = GeoDataFrame({"col1": extra_series}, geometry=active_values)
    right = GeoDataFrame({"col2": extra_series}, geometry=active_values)
    right = right.rename_geometry("geom")

    merged = left.merge(right, left_index=True, right_index=True)

    assert merged.col1.values.crs == self.wgs
    assert merged.geometry.values.crs == self.osgb
    assert merged.col2.values.crs == self.wgs
    assert merged.geom.values.crs == self.osgb
    assert merged.crs == self.osgb
def test_setitem_geometry(self):
    """Make sure a geometry column set from a list has a CRS (``__setitem__``)."""
    values = from_shapely(self.geoms, crs=27700)

    # list of geometries assigned over the existing geometry column
    from_list = GeoDataFrame({"col1": [0, 1]}, geometry=values)
    from_list["geometry"] = list(from_list.geometry)
    assert from_list.geometry.values.crs == self.osgb

    # array with a different CRS assigned over the geometry column
    from_array = GeoDataFrame({"col1": [0, 1]}, geometry=values)
    from_array["geometry"] = from_shapely(self.geoms, crs=4326)
    assert from_array.geometry.values.crs == self.wgs
def test_astype(self):
    """Casting a non-geometry column with ``astype`` keeps the frame CRS."""
    values = from_shapely(self.geoms, crs=27700)
    frame = GeoDataFrame({"col1": [0, 1]}, geometry=values)
    converted = frame.astype({"col1": str})
    assert converted.crs == self.osgb
def test_apply(self):
    """``GeoSeries.apply`` preserves the CRS if the result is a GeoSeries."""
    series = GeoSeries(self.arr)
    assert series.crs == 27700

    centroids = series.apply(lambda geom: geom.centroid)
    assert centroids.crs == 27700
def test_apply_geodataframe(self):
    """``GeoDataFrame.apply`` preserves the CRS if the result is a GeoDataFrame."""
    frame = GeoDataFrame({"col1": [0, 1]}, geometry=self.geoms, crs=27700)
    assert frame.crs == 27700

    # identity function applied column-wise (axis=0), then row-wise (axis=1)
    for axis in (0, 1):
        applied = frame.apply(lambda part: part, axis=axis)
        assert applied.crs == 27700
class TestSetCRS:
    """``set_crs`` semantics, exercised for both GeoSeries and GeoDataFrame."""

    @pytest.mark.parametrize(
        "constructor",
        [
            lambda geoms, crs: GeoSeries(geoms, crs=crs),
            lambda geoms, crs: GeoDataFrame(geometry=geoms, crs=crs),
        ],
        ids=["geoseries", "geodataframe"],
    )
    def test_set_crs(self, constructor):
        """Walk through copy / inplace / override / reset behaviour of ``set_crs``."""
        without_crs = constructor([Point(0, 0), Point(1, 1)], crs=None)
        assert without_crs.crs is None

        # by default returns a copy
        updated = without_crs.set_crs(crs="EPSG:4326")
        assert updated.crs == "EPSG:4326"
        assert without_crs.crs is None

        updated = without_crs.set_crs(epsg=4326)
        assert updated.crs == "EPSG:4326"
        assert without_crs.crs is None

        # with inplace=True the very same object comes back, now with a CRS
        updated = without_crs.set_crs(crs="EPSG:4326", inplace=True)
        assert updated is without_crs
        assert updated.crs == without_crs.crs == "EPSG:4326"

        # raise for non-naive when crs would be overridden
        with_crs = constructor([Point(0, 0), Point(1, 1)], crs="EPSG:4326")
        assert with_crs.crs == "EPSG:4326"
        with pytest.raises(ValueError, match="already has a CRS"):
            with_crs.set_crs("EPSG:3857")

        # allow for equal crs
        updated = with_crs.set_crs("EPSG:4326")
        assert updated.crs == "EPSG:4326"

        # replace with allow_override=True (copy first, then in place)
        updated = with_crs.set_crs("EPSG:3857", allow_override=True)
        assert with_crs.crs == "EPSG:4326"
        assert updated.crs == "EPSG:3857"

        updated = with_crs.set_crs("EPSG:3857", allow_override=True, inplace=True)
        assert with_crs.crs == "EPSG:3857"
        assert updated.crs == "EPSG:3857"

        # set CRS to None
        updated = with_crs.set_crs(crs=None, allow_override=True)
        assert updated.crs is None
        assert with_crs.crs == "EPSG:3857"
@@ -0,0 +1,15 @@
from geopandas import GeoDataFrame, read_file
from geopandas.datasets import get_path
import pytest
@pytest.mark.parametrize(
    "test_dataset", ["naturalearth_lowres", "naturalearth_cities", "nybb", "foo"]
)
def test_read_paths(test_dataset):
    """Resolving any dataset name through ``get_path`` raises the removal error."""
    removal_pattern = r"The geopandas\.dataset has been deprecated and was removed"
    with pytest.raises(AttributeError, match=removal_pattern):
        assert isinstance(read_file(get_path(test_dataset)), GeoDataFrame)
@@ -0,0 +1,87 @@
from textwrap import dedent
from geopandas._decorator import doc
# Base template for the ``doc`` decorator tests: the docstring below contains
# ``{method}`` / ``{operation}`` placeholders, and the keyword arguments given
# to ``@doc`` supply their values (see ``test_docstring_formatting``).
@doc(method="cumsum", operation="sum")
def cumsum(whatever):
    """
    This is the {method} method.
    It computes the cumulative {operation}.
    """
@doc(
cumsum,
dedent(
"""
Examples
--------
>>> cumavg([1, 2, 3])
2
"""
),
method="cumavg",
operation="average",
)
def cumavg(whatever): ...
# Takes its docstring template from the function ``cumsum`` and fills in its
# own placeholder values (see ``test_doc_template_from_func``).
@doc(cumsum, method="cummax", operation="maximum")
def cummax(whatever): ...
# Takes its template from ``cummax``, which itself took it from ``cumsum``
# (see ``test_inherit_doc_template``).
@doc(cummax, method="cummin", operation="minimum")
def cummin(whatever): ...
def test_docstring_formatting():
    """The placeholders in ``cumsum``'s own docstring are filled in."""
    expected = dedent(
        """
        This is the cumsum method.
        It computes the cumulative sum.
        """
    )
    assert cumsum.__doc__ == expected
def test_docstring_appending():
    """The extra text component follows the formatted template in ``cumavg``."""
    expected = dedent(
        """
        This is the cumavg method.
        It computes the cumulative average.
        Examples
        --------
        >>> cumavg([1, 2, 3])
        2
        """
    )
    assert cumavg.__doc__ == expected
def test_doc_template_from_func():
    """A template taken from another function is formatted with the new values."""
    expected = dedent(
        """
        This is the cummax method.
        It computes the cumulative maximum.
        """
    )
    assert cummax.__doc__ == expected
def test_inherit_doc_template():
    """A template can be passed on through a chain of decorated functions."""
    expected = dedent(
        """
        This is the cummin method.
        It computes the cumulative minimum.
        """
    )
    assert cummin.__doc__ == expected
@@ -0,0 +1,372 @@
import warnings
import numpy as np
import pandas as pd
import geopandas
from geopandas import GeoDataFrame, read_file
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_15, PANDAS_GE_20, PANDAS_GE_30
import pytest
from geopandas.testing import assert_geodataframe_equal, geom_almost_equals
from pandas.testing import assert_frame_equal
@pytest.fixture
def nybb_polydf(nybb_filename):
    """Frame read from ``nybb_filename`` with geometry column "myshapes" and a grouping key.

    The "manhattan_bronx" column is 5 everywhere except for index labels
    3 through 4, where it is 6.
    """
    frame = (
        read_file(nybb_filename)[["geometry", "BoroName", "BoroCode"]]
        .rename(columns={"geometry": "myshapes"})
        .set_geometry("myshapes")
    )
    frame["manhattan_bronx"] = 5
    frame.loc[3:4, "manhattan_bronx"] = 6
    frame["BoroCode"] = frame["BoroCode"].astype("int64")
    return frame
@pytest.fixture
def merged_shapes(nybb_polydf):
    """Expected dissolved geometries: one union per "manhattan_bronx" group."""
    # Rows labelled 0..2 make up group 5, rows labelled 3..4 make up group 6.
    unions = [
        nybb_polydf.loc[rows].geometry.union_all()
        for rows in (slice(0, 2), slice(3, 4))
    ]
    return GeoDataFrame(
        {"myshapes": unions},
        geometry="myshapes",
        index=pd.Index([5, 6], name="manhattan_bronx"),
        crs=nybb_polydf.crs,
    )
@pytest.fixture
def first(merged_shapes):
    """Copy of ``merged_shapes`` extended with the expected "first" aggregates."""
    expected = merged_shapes.copy()
    expected["BoroName"] = ["Staten Island", "Manhattan"]
    expected["BoroCode"] = [5, 1]
    return expected
@pytest.fixture
def expected_mean(merged_shapes):
    """Copy of ``merged_shapes`` extended with the expected mean of "BoroCode"."""
    expected = merged_shapes.copy()
    expected["BoroCode"] = [4, 1.5]
    return expected
def test_geom_dissolve(nybb_polydf, first):
    """Dissolve keeps the geometry column name and yields the expected shapes."""
    dissolved = nybb_polydf.dissolve("manhattan_bronx")
    assert dissolved.geometry.name == "myshapes"
    assert geom_almost_equals(dissolved, first)
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
def test_dissolve_retains_existing_crs(nybb_polydf):
    """A frame that has a CRS still has one after dissolving."""
    assert nybb_polydf.crs is not None
    dissolved = nybb_polydf.dissolve("manhattan_bronx")
    assert dissolved.crs is not None
def test_dissolve_retains_nonexisting_crs(nybb_polydf):
    """A frame whose CRS was cleared has no CRS after dissolving either."""
    # clear the CRS directly on the underlying geometry array
    nybb_polydf.geometry.array.crs = None
    dissolved = nybb_polydf.dissolve("manhattan_bronx")
    assert dissolved.crs is None
def test_first_dissolve(nybb_polydf, first):
    """Dissolve with the default aggregation matches the ``first`` fixture."""
    dissolved = nybb_polydf.dissolve("manhattan_bronx")
    assert_frame_equal(first, dissolved, check_column_type=False)
def test_mean_dissolve(nybb_polydf, first, expected_mean):
    """Dissolve with a mean aggregation, branching on the installed pandas version.

    Two results are produced in every branch and both are compared against
    the ``expected_mean`` fixture. The ``first`` fixture is requested but not
    used in the body.
    """
    if not PANDAS_GE_15:
        # pandas < 1.5: string alias and ``np.mean`` are used directly
        test = nybb_polydf.dissolve("manhattan_bronx", aggfunc="mean")
        test2 = nybb_polydf.dissolve("manhattan_bronx", aggfunc=np.mean)
    elif PANDAS_GE_15 and not PANDAS_GE_20:
        # pandas 1.5.x: the same calls are expected to emit a FutureWarning
        with pytest.warns(FutureWarning, match=".*used in dissolve is deprecated.*"):
            test = nybb_polydf.dissolve("manhattan_bronx", aggfunc="mean")
            test2 = nybb_polydf.dissolve("manhattan_bronx", aggfunc=np.mean)
    else:  # pandas 2.0
        test = nybb_polydf.dissolve(
            "manhattan_bronx", aggfunc="mean", numeric_only=True
        )
        # for non pandas "mean", numeric only cannot be applied. Drop columns manually
        test2 = nybb_polydf.drop(columns=["BoroName"]).dissolve(
            "manhattan_bronx", aggfunc="mean"
        )
    assert_frame_equal(expected_mean, test, check_column_type=False)
    assert_frame_equal(expected_mean, test2, check_column_type=False)
@pytest.mark.skipif(not PANDAS_GE_15 or PANDAS_GE_20, reason="warning for pandas 1.5.x")
def test_mean_dissolve_warning_capture(nybb_polydf, first, expected_mean):
    """On pandas 1.5.x "mean" warns, while "first" must stay silent."""
    deprecation_pattern = ".*used in dissolve is deprecated.*"
    with pytest.warns(FutureWarning, match=deprecation_pattern):
        nybb_polydf.dissolve("manhattan_bronx", aggfunc="mean")

    # test no warning for aggfunc first which doesn't have numeric only semantics;
    # the "error" filter turns any warning into an exception
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        nybb_polydf.dissolve("manhattan_bronx", aggfunc="first")
def test_dissolve_emits_other_warnings(nybb_polydf):
    """Warnings raised inside a user-supplied ``aggfunc`` reach the caller.

    Something special is only done for pandas 1.5.x, but this is expected to
    hold on any version.
    """

    def sum_and_warn(group):
        warnings.warn("foo")  # noqa: B028
        return group.sum(numeric_only=False) if PANDAS_GE_20 else group.sum()

    with pytest.warns(UserWarning, match="foo"):
        nybb_polydf.dissolve("manhattan_bronx", aggfunc=sum_and_warn)
def test_multicolumn_dissolve(nybb_polydf, first):
    """Dissolving by two (identical) key columns yields a two-level index."""
    source = nybb_polydf.copy()
    source["dup_col"] = source.manhattan_bronx
    dissolved = source.dissolve(["manhattan_bronx", "dup_col"], aggfunc="first")

    # expected: the single-key result, re-indexed by (original index, dup_col)
    expected = first.copy()
    expected["dup_col"] = expected.index
    expected = expected.set_index([expected.index, "dup_col"])

    assert_frame_equal(dissolved, expected, check_column_type=False)
def test_reset_index(nybb_polydf, first):
    """``as_index=False`` gives the same frame as resetting the index afterwards."""
    dissolved = nybb_polydf.dissolve("manhattan_bronx", as_index=False)
    assert_frame_equal(first.reset_index(), dissolved, check_column_type=False)
def test_dissolve_none(nybb_polydf):
    """``by=None`` collapses the whole frame into a single row."""
    geom_col = nybb_polydf.geometry.name
    dissolved = nybb_polydf.dissolve(by=None)
    expected = GeoDataFrame(
        {
            geom_col: [nybb_polydf.geometry.union_all()],
            "BoroName": ["Staten Island"],
            "BoroCode": [5],
            "manhattan_bronx": [5],
        },
        geometry=geom_col,
        crs=nybb_polydf.crs,
    )
    assert_frame_equal(expected, dissolved, check_column_type=False)
def test_dissolve_none_mean(nybb_polydf):
    """Without ``by``, a numeric-only mean is computed over the whole frame."""
    geom_col = nybb_polydf.geometry.name
    dissolved = nybb_polydf.dissolve(aggfunc="mean", numeric_only=True)
    expected = GeoDataFrame(
        {
            geom_col: [nybb_polydf.geometry.union_all()],
            "BoroCode": [3.0],
            "manhattan_bronx": [5.4],
        },
        geometry=geom_col,
        crs=nybb_polydf.crs,
    )
    assert_frame_equal(expected, dissolved, check_column_type=False)
def test_dissolve_level():
    """Dissolve by index level, given by position, by name, or as a list of either."""
    from_wkt = geopandas.array.from_wkt

    gdf = geopandas.GeoDataFrame(
        {
            "a": [1, 1, 2, 2],
            "b": [3, 4, 4, 4],
            "c": [3, 4, 5, 6],
            "geometry": from_wkt(
                ["POINT (0 0)", "POINT (1 1)", "POINT (2 2)", "POINT (3 3)"]
            ),
        }
    ).set_index(["a", "b", "c"])

    expected_a = geopandas.GeoDataFrame(
        {
            "a": [1, 2],
            "geometry": from_wkt(["MULTIPOINT (0 0, 1 1)", "MULTIPOINT (2 2, 3 3)"]),
        }
    ).set_index("a")
    expected_b = geopandas.GeoDataFrame(
        {
            "b": [3, 4],
            "geometry": from_wkt(["POINT (0 0)", "MULTIPOINT (1 1, 2 2, 3 3)"]),
        }
    ).set_index("b")
    expected_ab = geopandas.GeoDataFrame(
        {
            "a": [1, 1, 2],
            "b": [3, 4, 4],
            "geometry": from_wkt(
                ["POINT (0 0)", "POINT (1 1)", "MULTIPOINT (2 2, 3 3)"]
            ),
        }
    ).set_index(["a", "b"])

    # (expected frame, ``level`` argument), in the original checking order
    cases = [
        (expected_a, 0),
        (expected_a, "a"),
        (expected_b, 1),
        (expected_b, "b"),
        (expected_ab, [0, 1]),
        (expected_ab, ["a", "b"]),
    ]
    for expected, level in cases:
        assert_frame_equal(expected, gdf.dissolve(level=level))
def test_dissolve_sort():
    """Groups come back sorted by default and in encounter order with ``sort=False``."""
    from_wkt = geopandas.array.from_wkt

    gdf = geopandas.GeoDataFrame(
        {
            "a": [2, 1, 1],
            "geometry": from_wkt(["POINT (0 0)", "POINT (1 1)", "POINT (2 2)"]),
        }
    )

    unsorted_expected = geopandas.GeoDataFrame(
        {
            "a": [2, 1],
            "geometry": from_wkt(["POINT (0 0)", "MULTIPOINT (1 1, 2 2)"]),
        }
    ).set_index("a")
    sorted_expected = unsorted_expected.sort_index()

    assert_frame_equal(sorted_expected, gdf.dissolve("a"))
    assert_frame_equal(unsorted_expected, gdf.dissolve("a", sort=False))
def test_dissolve_categorical():
    """Dissolve by a categorical key, with and without ``observed=True``."""
    from_wkt = geopandas.array.from_wkt

    gdf = geopandas.GeoDataFrame(
        {
            "cat": pd.Categorical(["a", "a", "b", "b"]),
            "noncat": [1, 1, 1, 2],
            "to_agg": [1, 2, 3, 4],
            "geometry": from_wkt(
                ["POINT (0 0)", "POINT (1 1)", "POINT (2 2)", "POINT (3 3)"]
            ),
        }
    )

    # when observed=False we get an additional observation
    # that wasn't in the original data; its geometry placeholder depends on
    # the pandas version
    if PANDAS_GE_30:
        none_val = "GEOMETRYCOLLECTION EMPTY"
    else:
        none_val = None
    with_unobserved = geopandas.GeoDataFrame(
        {
            "cat": pd.Categorical(["a", "a", "b", "b"]),
            "noncat": [1, 2, 1, 2],
            "geometry": from_wkt(
                [
                    "MULTIPOINT (0 0, 1 1)",
                    none_val,
                    "POINT (2 2)",
                    "POINT (3 3)",
                ]
            ),
            "to_agg": [1, None, 3, 4],
        }
    ).set_index(["cat", "noncat"])

    # when observed=True we do not get any additional observations
    observed_only = geopandas.GeoDataFrame(
        {
            "cat": pd.Categorical(["a", "b", "b"]),
            "noncat": [1, 1, 2],
            "geometry": from_wkt(
                ["MULTIPOINT (0 0, 1 1)", "POINT (2 2)", "POINT (3 3)"]
            ),
            "to_agg": [1, 3, 4],
        }
    ).set_index(["cat", "noncat"])

    keys = ["cat", "noncat"]
    assert_frame_equal(with_unobserved, gdf.dissolve(["cat", "noncat"]))
    assert_frame_equal(observed_only, gdf.dissolve(keys, observed=True))
def test_dissolve_dropna():
    """A missing group key is kept with ``dropna=False`` and dropped by default."""
    from_wkt = geopandas.array.from_wkt

    gdf = geopandas.GeoDataFrame(
        {
            "a": [1, 1, None],
            "geometry": from_wkt(["POINT (0 0)", "POINT (1 1)", "POINT (2 2)"]),
        }
    )

    keeping_na = geopandas.GeoDataFrame(
        {
            "a": [1.0, np.nan],
            "geometry": from_wkt(["MULTIPOINT (0 0, 1 1)", "POINT (2 2)"]),
        }
    ).set_index("a")
    dropping_na = geopandas.GeoDataFrame(
        {
            "a": [1.0],
            "geometry": from_wkt(["MULTIPOINT (0 0, 1 1)"]),
        }
    ).set_index("a")

    assert_frame_equal(keeping_na, gdf.dissolve("a", dropna=False))
    assert_frame_equal(dropping_na, gdf.dissolve("a"))
def test_dissolve_dropna_warn(nybb_polydf):
    """Dissolving with default parameters must not warn about ``dropna``."""
    # No warning with default params
    with warnings.catch_warnings(record=True) as record:
        # Record every warning. Without an explicit "always" filter, a warning
        # that is silenced by an active filter (or was already shown once from
        # the same location) never reaches ``record`` and the check below
        # would pass vacuously.
        warnings.simplefilter("always")
        nybb_polydf.dissolve()

    for r in record:
        assert "dropna kwarg is not supported" not in str(r.message)
def test_dissolve_multi_agg(nybb_polydf, merged_shapes):
    """Dissolve with a dict ``aggfunc`` mixing a list of functions and a single one.

    The expected frame is ``merged_shapes`` extended with tuple-named columns,
    one per (column, aggregation) pair. The test also requires that no warning
    was recorded while dissolving.
    """
    # expected aggregates, keyed by (source column, aggregation name)
    merged_shapes[("BoroCode", "min")] = [3, 1]
    merged_shapes[("BoroCode", "max")] = [5, 2]
    merged_shapes[("BoroName", "count")] = [3, 2]
    # NOTE(review): only warnings that pass the currently active filters are
    # recorded here; no "always" filter is installed inside the context.
    with warnings.catch_warnings(record=True) as record:
        test = nybb_polydf.dissolve(
            by="manhattan_bronx",
            aggfunc={
                "BoroCode": ["min", "max"],
                "BoroName": "count",
            },
        )
    assert_geodataframe_equal(test, merged_shapes)
    assert len(record) == 0
def test_coverage_dissolve(nybb_polydf):
    """``method="coverage"`` gives the same result as coverage unions built by hand."""
    # Rows labelled 0..2 form group 5, rows labelled 3..4 form group 6.
    coverage_unions = [
        nybb_polydf.loc[rows].geometry.union_all(method="coverage")
        for rows in (slice(0, 2), slice(3, 4))
    ]
    expected = GeoDataFrame(
        {"myshapes": coverage_unions},
        geometry="myshapes",
        index=pd.Index([5, 6], name="manhattan_bronx"),
        crs=nybb_polydf.crs,
    )
    expected["BoroName"] = ["Staten Island", "Manhattan"]
    expected["BoroCode"] = [5, 1]

    dissolved = nybb_polydf.dissolve("manhattan_bronx", method="coverage")
    assert_frame_equal(expected, dissolved, check_column_type=False)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,648 @@
"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite (by inheriting the pandas test suite), and should
contain no other tests.
Other tests (e.g. related to the spatial functionality or integration
with GeoSeries/GeoDataFrame) should be added to test_array.py and others.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
A set of fixtures is defined to provide data for the tests (the fixtures
are expected to be available to pytest by the inherited pandas tests).
"""
import itertools
import operator
import numpy as np
import pandas as pd
from pandas.tests.extension import base as extension_tests
import shapely.geometry
from shapely.geometry import Point
from geopandas._compat import PANDAS_GE_15, PANDAS_GE_21, PANDAS_GE_22
from geopandas.array import GeometryArray, GeometryDtype, from_shapely
import pytest
from pandas.testing import assert_frame_equal, assert_series_equal
# -----------------------------------------------------------------------------
# Compat with extension tests in older pandas versions
# -----------------------------------------------------------------------------
# Reusable skip markers; each one skips the decorated test unconditionally
# with the given reason.
not_yet_implemented = pytest.mark.skip(reason="Not yet implemented")
no_minmax = pytest.mark.skip(reason="Min/max not supported")
# -----------------------------------------------------------------------------
# Required fixtures
# -----------------------------------------------------------------------------
@pytest.fixture
def dtype():
    """Provide the ExtensionDtype under test: a fresh ``GeometryDtype``."""
    geometry_dtype = GeometryDtype()
    return geometry_dtype
def make_data():
    """Build a GeometryArray of the 100 points ``(i, i)`` for ``i`` in 0..99."""
    # Fill a pre-allocated object array so that the points are stored as
    # individual elements.
    points = np.empty(100, dtype=object)
    points[:] = [shapely.geometry.Point(i, i) for i in range(100)]
    return from_shapely(points)
@pytest.fixture
def data():
    """Length-100 array for this type.

    * data[0] and data[1] should both be non missing
    * data[0] and data[1] should not be equal
    """
    array = make_data()
    return array
@pytest.fixture
def data_for_twos():
    """Length-100 array in which all the elements are two.

    Not provided for this type: requesting the fixture raises
    ``NotImplementedError``.
    """
    raise NotImplementedError
@pytest.fixture
def data_missing():
    """Length-2 array with [NA, Valid]"""
    missing, valid = None, shapely.geometry.Point(1, 1)
    return from_shapely([missing, valid])
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """Parametrized fixture giving 'data' and 'data_missing'"""
    selected = request.param
    if selected == "data":
        return data
    if selected == "data_missing":
        return data_missing
@pytest.fixture
def data_repeated(data):
    """
    Generate many datasets.

    Parameters
    ----------
    data : fixture implementing `data`

    Returns
    -------
    Callable[[int], Generator]:
        A callable that takes a `count` argument and
        returns a generator yielding `count` datasets.
    """

    def gen(count):
        # the very same ``data`` object is yielded ``count`` times
        remaining = count
        while remaining > 0:
            yield data
            remaining -= 1

    return gen
@pytest.fixture
def data_for_sorting():
    """Length-3 array with a known sort order.

    This should be three items [B, C, A] with
    A < B < C
    """
    b, c, a = Point(0, 1), Point(1, 1), Point(0, 0)
    return from_shapely([b, c, a])
@pytest.fixture
def data_missing_for_sorting():
    """Length-3 array with a known sort order.

    This should be three items [B, NA, A] with
    A < B and NA missing.
    """
    b, a = Point(1, 2), Point(0, 0)
    return from_shapely([b, None, a])
@pytest.fixture
def na_cmp():
    """Binary operator for comparing NA values.

    Returns a function of two arguments that is True only when both
    arguments are ``None``, the scalar NA used by this fixture set.
    """

    def both_missing(x, y):
        return x is None and y is None

    return both_missing
@pytest.fixture
def na_value():
    """The scalar missing value for this type, which is ``None``."""
    missing = None
    return missing
@pytest.fixture
def data_for_grouping():
    """Data for factorization, grouping, and unique tests.

    Expected to be like [B, B, NA, NA, A, A, B, C]
    Where A < B < C and NA is missing
    """
    # (x, y) per slot; ``None`` marks a missing value
    layout = [(1, 1), (1, 1), None, None, (0, 0), (0, 0), (1, 1), (2, 2)]
    return from_shapely(
        [
            None if xy is None else shapely.geometry.Point(*xy)
            for xy in layout
        ]
    )
@pytest.fixture(params=[True, False])
def box_in_series(request):
    """Boolean fixture: whether the data should be boxed in a Series."""
    boxed = request.param
    return boxed
@pytest.fixture(
    params=[
        lambda x: 1,
        lambda x: [1] * len(x),
        lambda x: pd.Series([1] * len(x)),
        lambda x: x,
    ],
    ids=["scalar", "list", "series", "object"],
)
def groupby_apply_op(request):
    """
    Functions to test groupby.apply(): each returns a scalar, a list, a
    Series, or the group itself.
    """
    operation = request.param
    return operation
@pytest.fixture(params=[True, False])
def as_frame(request):
    """
    Boolean switch for tests comparing a Series with Series.to_frame().
    """
    flag = request.param
    return flag
@pytest.fixture(params=[True, False])
def as_series(request):
    """
    Boolean switch for tests comparing arr with Series(arr).
    """
    flag = request.param
    return flag
@pytest.fixture(params=[True, False])
def use_numpy(request):
    """
    Boolean switch for tests comparing an ExtensionDtype array with a
    numpy array.
    """
    flag = request.param
    return flag
@pytest.fixture(params=["ffill", "bfill"])
def fillna_method(request):
    """
    Fill method name, 'ffill' or 'bfill', for
    Series.fillna(method=<method>) testing.
    """
    method_name = request.param
    return method_name
@pytest.fixture(params=[True, False])
def as_array(request):
    """
    Boolean switch for ExtensionDtype _from_sequence method testing.
    """
    flag = request.param
    return flag
@pytest.fixture
def invalid_scalar(data):
    """
    A scalar that *cannot* be held by this ExtensionArray.

    A bare ``object`` instance is used. The default should work for most
    subclasses, but is not guaranteed. If the array can hold any item
    (i.e. object dtype), then use pytest.skip.
    """
    return object()
# The following fixtures also exist in pandas/conftest.py; they are defined
# here rather than imported, for compatibility.
@pytest.fixture(
    params=["sum", "max", "min", "mean", "prod", "std", "var", "median", "kurt", "skew"]
)
def all_numeric_reductions(request):
    """
    Name of a numeric reduction, one per parametrization.
    """
    reduction_name = request.param
    return reduction_name
@pytest.fixture(params=["all", "any"])
def all_boolean_reductions(request):
    """
    Name of a boolean reduction, one per parametrization.
    """
    reduction_name = request.param
    return reduction_name
# Only == and != are supported for GeometryArray, so the ordering operators
# (__le__, __lt__, __ge__, __gt__) are left out of the parametrization.
@pytest.fixture(params=["__eq__", "__ne__"])
def all_compare_operators(request):
    """
    Fixture for dunder names of the compare operations used here

    * ==
    * !=
    """
    operator_name = request.param
    return operator_name
@pytest.fixture(params=[None, lambda x: x])
def sort_by_key(request):
    """
    Key argument for sorting methods: either None (no key) or the
    identity function.
    """
    key = request.param
    return key
# -----------------------------------------------------------------------------
# Inherited tests
# -----------------------------------------------------------------------------
class TestDtype(extension_tests.BaseDtypeTests):
    """Inherited dtype tests plus two checks specific to the geometry dtype."""

    # additional tests

    def test_array_type_with_arg(self, data, dtype):
        """The dtype constructs ``GeometryArray`` as its array type."""
        array_type = dtype.construct_array_type()
        assert array_type is GeometryArray

    def test_registry(self, data, dtype):
        """The string alias "geometry" works in ``astype`` on an object Series."""
        as_objects = pd.Series(np.asarray(data), dtype=object)
        converted = as_objects.astype("geometry")
        assert isinstance(converted.array, GeometryArray)
        assert_series_equal(converted, pd.Series(data))
class TestInterface(extension_tests.BaseInterfaceTests):
    """Inherited interface tests with an adapted ``__contains__`` check."""

    def test_contains(self, data, data_missing):
        """Membership works for valid elements and for ``None`` as the NA value."""
        # overridden due to the inconsistency between
        # GeometryDtype.na_value = np.nan
        # and None being used as NA in array

        # ensure data without missing values
        not_missing = ~data.isna()
        data = data[not_missing]

        # first elements are non-missing
        assert data[0] in data
        assert data_missing[0] in data_missing

        # None counts as contained only where a missing value is present
        assert None in data_missing
        assert None not in data
        assert pd.NaT not in data_missing
class TestConstructors(extension_tests.BaseConstructorsTests):
    """Inherited constructor tests, run without modification."""
class TestReshaping(extension_tests.BaseReshapingTests):
    """Inherited reshaping tests with a locally adapted ``test_unstack``."""

    # NOTE: this test is copied from pandas/tests/extension/base/reshaping.py
    # because starting with pandas 3.0 the assert_frame_equal is strict regarding
    # the exact missing value (None vs NaN)
    # Our `result` uses None, but the way the `expected` is created results in
    # NaNs (and specifying to use None as fill value in unstack also does not
    # help)
    # -> the only change compared to the upstream test is marked
    @pytest.mark.parametrize(
        "index",
        [
            # Two levels, uniform.
            pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]),
            # non-uniform
            pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "b")]),
            # three levels, non-uniform
            pd.MultiIndex.from_product([("A", "B"), ("a", "b", "c"), (0, 1, 2)]),
            pd.MultiIndex.from_tuples(
                [
                    ("A", "a", 1),
                    ("A", "b", 0),
                    ("A", "a", 0),
                    ("B", "a", 0),
                    ("B", "c", 1),
                ]
            ),
        ],
    )
    @pytest.mark.parametrize("obj", ["series", "frame"])
    def test_unstack(self, data, index, obj):
        """Unstack a Series/DataFrame of geometries over every level combination.

        For each combination the result must still hold arrays of the same
        type as ``data`` and must match, after casting to object, the result
        of unstacking the object-dtype version of the same data.
        """
        # trim the data to the size of the parametrized index
        data = data[: len(index)]
        if obj == "series":
            ser = pd.Series(data, index=index)
        else:
            ser = pd.DataFrame({"A": data, "B": data}, index=index)
        n = index.nlevels
        levels = list(range(n))
        # [0, 1, 2]
        # [(0,), (1,), (2,), (0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]
        combinations = itertools.chain.from_iterable(
            itertools.permutations(levels, i) for i in range(1, n)
        )
        for level in combinations:
            result = ser.unstack(level=level)
            # every resulting column must still be backed by the extension array
            assert all(
                isinstance(result[col].array, type(data)) for col in result.columns
            )
            if obj == "series":
                # We should get the same result with to_frame+unstack+droplevel
                df = ser.to_frame()
                alt = df.unstack(level=level).droplevel(0, axis=1)
                assert_frame_equal(result, alt)
            # reference result computed on plain object dtype
            obj_ser = ser.astype(object)
            expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
            if obj == "series":
                assert (expected.dtypes == object).all()
            # <------------ next line is added
            expected[expected.isna()] = None
            # ------------->
            result = result.astype(object)
            assert_frame_equal(result, expected)
class TestGetitem(extension_tests.BaseGetitemTests):
    """Inherited ``__getitem__`` tests, run without modification."""
class TestSetitem(extension_tests.BaseSetitemTests):
    """Inherited ``__setitem__`` tests, run without modification."""
class TestMissing(extension_tests.BaseMissingTests):
    """Inherited missing-value tests with ``fillna`` adjustments for geometries."""

    def test_fillna_series(self, data_missing):
        """``Series.fillna`` with a scalar, an aligned series and a non-aligned series."""
        valid = data_missing[1]
        series = pd.Series(data_missing)

        # Fill with a scalar
        filled = series.fillna(valid)
        all_valid = pd.Series(data_missing._from_sequence([valid, valid]))
        assert_series_equal(filled, all_valid)

        # Fill with a series
        aligned_filler = pd.Series(
            from_shapely(
                [
                    shapely.geometry.Point(1, 1),
                    shapely.geometry.Point(2, 2),
                ],
            )
        )
        filled = series.fillna(aligned_filler)
        all_valid = pd.Series(data_missing._from_sequence([valid, valid]))
        assert_series_equal(filled, all_valid)

        # Fill with a series not affecting the missing values
        unaligned_filler = pd.Series(
            from_shapely(
                [
                    shapely.geometry.Point(2, 2),
                    shapely.geometry.Point(1, 1),
                ]
            ),
            index=[10, 11],
        )
        filled = series.fillna(unaligned_filler)
        assert_series_equal(filled, series)

    # More `GeoSeries.fillna` testcases are in
    # `geopandas\tests\test_pandas_methods.py::test_fillna_scalar`
    # and `geopandas\tests\test_pandas_methods.py::test_fillna_series`.

    # The four overrides below only add a version guard and otherwise
    # delegate to the inherited implementation.

    @pytest.mark.skipif(
        not PANDAS_GE_21, reason="fillna method not supported with older pandas"
    )
    def test_fillna_limit_pad(self, data_missing):
        super().test_fillna_limit_pad(data_missing)

    @pytest.mark.skipif(
        not PANDAS_GE_21, reason="fillna method not supported with older pandas"
    )
    def test_fillna_limit_backfill(self, data_missing):
        super().test_fillna_limit_backfill(data_missing)

    @pytest.mark.skipif(
        not PANDAS_GE_21, reason="fillna method not supported with older pandas"
    )
    def test_fillna_series_method(self, data_missing, fillna_method):
        super().test_fillna_series_method(data_missing, fillna_method)

    @pytest.mark.skipif(
        not PANDAS_GE_21, reason="fillna method not supported with older pandas"
    )
    def test_fillna_no_op_returns_copy(self, data):
        super().test_fillna_no_op_returns_copy(data)
if PANDAS_GE_22:
from pandas.tests.extension.base import BaseReduceTests
else:
from pandas.tests.extension.base import BaseNoReduceTests as BaseReduceTests
class TestReduce(BaseReduceTests):
    """Reduction tests inherited from whichever base class was imported above."""

    @pytest.mark.skip("boolean reduce (any/all) tested in test_pandas_methods")
    def test_reduce_series_boolean(self):
        # Unconditionally skipped, so the empty body never runs.
        pass
_all_arithmetic_operators = [
"__add__",
"__radd__",
# '__sub__', '__rsub__',
"__mul__",
"__rmul__",
"__floordiv__",
"__rfloordiv__",
"__truediv__",
"__rtruediv__",
"__pow__",
"__rpow__",
"__mod__",
"__rmod__",
]
@pytest.fixture(params=_all_arithmetic_operators)
def all_arithmetic_operators(request):
    """
    Fixture for dunder names for common arithmetic operations

    Adapted to exclude __sub__, as this is implemented as "difference".

    Returns the current entry of ``_all_arithmetic_operators`` (one test run
    per entry).
    """
    return request.param
# an inherited test from pandas creates a Series from a list of geometries, which
# triggers the warning from Shapely, out of control of GeoPandas, so ignoring here
@pytest.mark.filterwarnings(
    "ignore:The array interface is deprecated and will no longer work in Shapely 2.0"
)
class TestArithmeticOps(extension_tests.BaseArithmeticOpsTests):
    """Inherited arithmetic-operator tests, with two of them skipped."""

    @pytest.mark.skip(reason="not applicable")
    def test_divmod_series_array(self, data, data_for_twos):
        # Skipped override: the empty body replaces the inherited test.
        pass

    @pytest.mark.skip(reason="not applicable")
    def test_add_series_with_extension_array(self, data):
        # Skipped override: the empty body replaces the inherited test.
        pass
# The Shapely deprecation warning below is raised when an inherited pandas test
# builds a Series from a list of geometries; GeoPandas cannot prevent it, so it
# is filtered out for this class.
@pytest.mark.filterwarnings(
    "ignore:The array interface is deprecated and will no longer work in Shapely 2.0"
)
class TestComparisonOps(extension_tests.BaseComparisonOpsTests):
    """Comparison-operator tests checked against ``Series.combine``."""

    def _compare_other(self, s, data, op_name, other):
        """Assert that ``op(s, other)`` equals the element-wise ``s.combine``."""
        # "__eq__" -> operator.eq, "__lt__" -> operator.lt, ...
        comparator = getattr(operator, op_name.strip("_"))
        actual = comparator(s, other)
        reference = s.combine(other, comparator)
        assert_series_equal(actual, reference)

    def test_compare_scalar(self, data, all_compare_operators):
        """Compare a Series of ``data`` with its first element."""
        series = pd.Series(data)
        first_element = data[0]
        self._compare_other(series, data, all_compare_operators, first_element)

    def test_compare_array(self, data, all_compare_operators):
        """Compare a Series of ``data`` with a Series repeating its first element."""
        series = pd.Series(data)
        repeated_first = pd.Series([data[0]] * len(data))
        self._compare_other(series, data, all_compare_operators, repeated_first)
class TestMethods(extension_tests.BaseMethodsTests):
    """Inherited method tests, with unsupported ones overridden or skipped.

    ``not_yet_implemented`` and ``no_minmax`` are decorators defined outside
    this excerpt.
    """

    # NOTE(review): the body is empty, so when the skipif condition is false
    # this override passes without running the inherited check -- confirm
    # that is intended.
    @pytest.mark.skipif(
        not PANDAS_GE_15, reason="sorting index not yet working with older pandas"
    )
    @pytest.mark.parametrize("dropna", [True, False])
    def test_value_counts(self, all_data, dropna):
        pass

    # NOTE(review): same as above -- empty override, nothing is asserted.
    @pytest.mark.skipif(
        not PANDAS_GE_15, reason="sorting index not yet working with older pandas"
    )
    def test_value_counts_with_normalize(self, data):
        pass

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_frame(self, data_for_sorting, ascending):
        """Run the inherited test for both sort directions."""
        super().test_sort_values_frame(data_for_sorting, ascending)

    @pytest.mark.skip(reason="searchsorted not supported")
    def test_searchsorted(self, data_for_sorting, as_series):
        pass

    @not_yet_implemented
    def test_combine_le(self):
        pass

    @pytest.mark.skip(reason="addition not supported")
    def test_combine_add(self):
        pass

    @not_yet_implemented
    def test_fillna_length_mismatch(self, data_missing):
        """Expect ``ValueError`` when the fill value's length does not match."""
        msg = "Length of 'value' does not match."
        with pytest.raises(ValueError, match=msg):
            data_missing.fillna(data_missing.take([1]))

    # The following overrides replace the inherited argmin/argmax tests with
    # empty bodies decorated by ``no_minmax``.
    @no_minmax
    def test_argmin_argmax(self):
        pass

    @no_minmax
    def test_argmin_argmax_empty_array(self):
        pass

    @no_minmax
    def test_argmin_argmax_all_na(self):
        pass

    @no_minmax
    def test_argreduce_series(self):
        pass

    @no_minmax
    def test_argmax_argmin_no_skipna_notimplemented(self):
        pass
class TestCasting(extension_tests.BaseCastingTests):
    """Run the inherited ``BaseCastingTests`` suite with no overrides."""

    pass
class TestGroupby(extension_tests.BaseGroupbyTests):
    """Groupby tests; every override only delegates to the parent test."""

    @pytest.mark.parametrize("as_index", [True, False])
    def test_groupby_extension_agg(self, as_index, data_for_grouping):
        """Run the inherited aggregation test for both ``as_index`` values."""
        super().test_groupby_extension_agg(as_index, data_for_grouping)

    def test_groupby_extension_transform(self, data_for_grouping):
        """Run the inherited transform test unchanged."""
        super().test_groupby_extension_transform(data_for_grouping)

    @pytest.mark.parametrize(
        "op",
        [
            lambda x: 1,
            lambda x: [1] * len(x),
            lambda x: pd.Series([1] * len(x)),
            lambda x: x,
        ],
        ids=["scalar", "list", "series", "object"],
    )
    def test_groupby_extension_apply(self, data_for_grouping, op):
        """Run the inherited apply test once per ``op`` callable listed above."""
        super().test_groupby_extension_apply(data_for_grouping, op)
class TestPrinting(extension_tests.BasePrintingTests):
    """Run the inherited ``BasePrintingTests`` suite with no overrides."""

    pass
@not_yet_implemented
class TestParsing(extension_tests.BaseParsingTests):
    """Inherited parsing tests, wrapped in the ``not_yet_implemented`` decorator.

    The decorator is defined outside this excerpt.
    """

    pass
@@ -0,0 +1,170 @@
import pandas as pd
from shapely.geometry import Point
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import HAS_PYPROJ
from geopandas.tools import geocode, reverse_geocode
from geopandas.tools.geocoding import _prepare_geocode_result
import pytest
from geopandas.testing import assert_geodataframe_equal
from geopandas.tests.util import assert_geoseries_equal, mock
from pandas.testing import assert_series_equal
geopy = pytest.importorskip("geopy")
class ForwardMock(mock.MagicMock):
    """
    Stand-in for a forward geocoding function.

    Every call returns ``(address, (p, p + 0.5))`` where ``address`` is the
    first positional argument and ``p`` starts at 0.0 and grows by one per call.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance call counter used to build the fake coordinates.
        self._n = 0.0

    def __call__(self, *args, **kwargs):
        counter = self._n
        # Set the value MagicMock will hand back for this particular call.
        self.return_value = (args[0], (counter, counter + 0.5))
        self._n = counter + 1
        return super().__call__(*args, **kwargs)
class ReverseMock(mock.MagicMock):
    """
    Stand-in for a reverse geocoding function.

    Every call returns ``('address{p}', point)`` where ``point`` is the first
    positional argument and ``p`` starts at 0 and grows by one per call.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance call counter used to build the fake address.
        self._n = 0

    def __call__(self, *args, **kwargs):
        fake_address = "address{0}".format(self._n)
        # Set the value MagicMock will hand back for this particular call.
        self.return_value = (fake_address, args[0])
        self._n += 1
        return super().__call__(*args, **kwargs)
@pytest.fixture
def locations():
    """Two address strings used as forward-geocoding input."""
    return ["260 Broadway, New York, NY", "77 Massachusetts Ave, Cambridge, MA"]
@pytest.fixture
def points():
    """Two points used as reverse-geocoding input."""
    return [Point(-71.0597732, 42.3584308), Point(-77.0365305, 38.8977332)]
def test_prepare_result():
    """Feed ``_prepare_geocode_result`` sample results from the geocoder loop.

    The input tuples are treated as lat/lon; the resulting geometries must
    have the two coordinates swapped (lon/lat).
    """
    first = Point(12.3, -45.6)  # Treat these as lat/lon
    second = Point(-23.4, 56.7)
    raw_results = {
        "a": ("address0", first.coords[0]),
        "b": ("address1", second.coords[0]),
    }

    frame = _prepare_geocode_result(raw_results)
    assert type(frame) is GeoDataFrame
    if HAS_PYPROJ:
        assert frame.crs == "EPSG:4326"
    assert len(frame) == 2
    assert "address" in frame

    # Output from the df should be lon/lat
    for key, source_point in (("a", first), ("b", second)):
        out_coords = frame.loc[key]["geometry"].coords[0]
        in_coords = source_point.coords[0]
        assert out_coords[0] == pytest.approx(in_coords[1])
        assert out_coords[1] == pytest.approx(in_coords[0])
def test_prepare_result_none():
    """A ``(None, None)`` result yields an empty geometry and a ``None`` address."""
    known = Point(12.3, -45.6)  # Treat these as lat/lon
    raw_results = {"a": ("address0", known.coords[0]), "b": (None, None)}

    frame = _prepare_geocode_result(raw_results)
    assert type(frame) is GeoDataFrame
    if HAS_PYPROJ:
        assert frame.crs == "EPSG:4326"
    assert len(frame) == 2
    assert "address" in frame

    missing_row = frame.loc["b"]
    # TODO we should probably replace this with a missing value instead of point?
    assert len(missing_row["geometry"].coords) == 0
    assert missing_row["geometry"].is_empty
    assert missing_row["address"] is None
@pytest.mark.parametrize("geocode_result", (None, (None, None)))
def test_prepare_geocode_result_when_result_is(geocode_result):
    """``None`` and ``(None, None)`` both produce one empty-point row."""
    wanted = GeoDataFrame(
        {"geometry": [Point()], "address": [None]},
        crs="EPSG:4326",
    )
    actual = _prepare_geocode_result({0: geocode_result})
    assert_geodataframe_equal(actual, wanted)
def test_bad_provider_forward():
    """``geocode`` with an unknown provider name raises ``GeocoderNotFound``."""
    from geopy.exc import GeocoderNotFound

    with pytest.raises(GeocoderNotFound):
        geocode(["cambridge, ma"], "badprovider")
def test_bad_provider_reverse():
    """``reverse_geocode`` with an unknown provider raises ``GeocoderNotFound``."""
    from geopy.exc import GeocoderNotFound

    with pytest.raises(GeocoderNotFound):
        reverse_geocode([Point(0, 0)], "badprovider")
def test_forward(locations, points):
    """Forward-geocode ``locations`` through a patched Photon geocoder.

    The provider is given once by name and once as the class; each run uses a
    fresh ``ForwardMock`` in place of ``Photon.geocode``.
    """
    from geopy.geocoders import Photon

    for provider in ["photon", Photon]:
        with mock.patch("geopy.geocoders.Photon.geocode", ForwardMock()) as patched:
            result = geocode(locations, provider=provider, timeout=2)
            assert len(locations) == patched.call_count

        count = len(locations)
        assert isinstance(result, GeoDataFrame)

        # ForwardMock returns (i, i + 0.5); the expected points have the two
        # coordinates in the opposite order.
        wanted_points = [Point(float(i) + 0.5, float(i)) for i in range(count)]
        wanted_geometry = GeoSeries(wanted_points, crs="EPSG:4326")
        assert_geoseries_equal(wanted_geometry, result["geometry"])
        assert_series_equal(result["address"], pd.Series(locations, name="address"))
def test_reverse(locations, points):
    """Reverse-geocode ``points`` through a patched Photon geocoder.

    The provider is given once by name and once as the class; each run uses a
    fresh ``ReverseMock`` in place of ``Photon.reverse``.
    """
    from geopy.geocoders import Photon

    for provider in ["photon", Photon]:
        with mock.patch("geopy.geocoders.Photon.reverse", ReverseMock()) as patched:
            result = reverse_geocode(points, provider=provider, timeout=2)
            assert len(points) == patched.call_count

        assert isinstance(result, GeoDataFrame)

        wanted_geometry = GeoSeries(points, crs="EPSG:4326")
        assert_geoseries_equal(wanted_geometry, result["geometry"])

        # ReverseMock numbers its fake addresses from 0 upwards.
        wanted_address = pd.Series(
            ["address" + str(i) for i in range(len(points))], name="address"
        )
        assert_series_equal(result["address"], wanted_address)
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,747 @@
import json
import os
import random
import shutil
import tempfile
import warnings
import numpy as np
import pandas as pd
from shapely.geometry import (
GeometryCollection,
LineString,
MultiLineString,
MultiPoint,
MultiPolygon,
Point,
Polygon,
)
from shapely.geometry.base import BaseGeometry
import geopandas._compat as compat
from geopandas import GeoDataFrame, GeoSeries, clip, read_file
from geopandas.array import GeometryArray, GeometryDtype
import pytest
from geopandas.testing import assert_geoseries_equal, geom_almost_equals
from geopandas.tests.util import geom_equals
from numpy.testing import assert_array_equal
from pandas.testing import assert_index_equal, assert_series_equal
class TestSeries:
    """Tests for ``GeoSeries`` built on a small set of shared geometry fixtures.

    ``setup_method`` creates the fixtures and a temporary directory before each
    test; ``teardown_method`` removes the directory afterwards.
    """

    def setup_method(self):
        """Create the temporary directory and the shared geometries/series."""
        self.tempdir = tempfile.mkdtemp()
        # Two triangles that each cover half of the unit square ``sq``.
        self.t1 = Polygon([(0, 0), (1, 0), (1, 1)])
        self.t2 = Polygon([(0, 0), (1, 1), (0, 1)])
        self.sq = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
        self.g1 = GeoSeries([self.t1, self.sq])
        self.g2 = GeoSeries([self.sq, self.t1])
        self.g3 = GeoSeries([self.t1, self.t2], crs="epsg:4326")
        self.g4 = GeoSeries([self.t2, self.t1])
        # Same two triangles plus an empty polygon / a missing value.
        self.na = GeoSeries([self.t1, self.t2, Polygon()])
        self.na_none = GeoSeries([self.t1, self.t2, None])
        # Copies of g1/g2 with string indexes that overlap only on "B".
        self.a1 = self.g1.copy()
        self.a1.index = ["A", "B"]
        self.a2 = self.g2.copy()
        self.a2.index = ["B", "C"]
        # Two points in EPSG:4326 ...
        self.esb = Point(-73.9847, 40.7484)
        self.sol = Point(-74.0446, 40.6893)
        self.landmarks = GeoSeries([self.esb, self.sol], crs="epsg:4326")
        self.l1 = LineString([(0, 0), (0, 1), (1, 1)])
        self.l2 = LineString([(0, 0), (1, 0), (1, 1), (0, 1)])
        self.g5 = GeoSeries([self.l1, self.l2])
        # ... and the EPSG:3857 coordinates that test_to_json_wgs84 expects to
        # convert back to ``esb`` / ``sol``.
        self.esb3857 = Point(-8235939.130493107, 4975301.253789809)
        self.sol3857 = Point(-8242607.167991625, 4966620.938285081)
        self.landmarks3857 = GeoSeries([self.esb3857, self.sol3857], crs="epsg:3857")

    def teardown_method(self):
        """Remove the temporary directory created in ``setup_method``."""
        shutil.rmtree(self.tempdir)

    def test_copy(self):
        """``copy`` returns a GeoSeries with the same name and crs."""
        gc = self.g3.copy()
        assert type(gc) is GeoSeries
        assert self.g3.name == gc.name
        assert self.g3.crs == gc.crs

    def test_in(self):
        """Check ``in`` for contained / not contained geometries and a number."""
        assert self.t1 in self.g1
        assert self.sq in self.g1
        assert self.t1 in self.a1
        assert self.t2 in self.g3
        assert self.sq not in self.g3
        assert 5 not in self.g3

    def test_align(self):
        """``align`` returns GeoSeries and fills unmatched labels with ``None``."""
        a1, a2 = self.a1.align(self.a2)
        assert isinstance(a1, GeoSeries)
        assert isinstance(a2, GeoSeries)
        assert a2["A"] is None
        assert a1["B"].equals(a2["B"])
        assert a1["C"] is None

    @pytest.mark.skipif(not compat.HAS_PYPROJ, reason="pyproj not available")
    def test_align_crs(self):
        """Each aligned result keeps the crs of its own input (including None)."""
        a1 = self.a1.set_crs("epsg:4326")
        a2 = self.a2.set_crs("epsg:31370")

        res1, res2 = a1.align(a2)
        assert res1.crs == "epsg:4326"
        assert res2.crs == "epsg:31370"

        res1, res2 = a1.align(a2.set_crs(None, allow_override=True))
        assert res1.crs == "epsg:4326"
        assert res2.crs is None

    def test_align_mixed(self):
        """Aligning with a plain pandas Series reindexes that Series with NaN."""
        a1 = self.a1
        s2 = pd.Series([1, 2], index=["B", "C"])
        res1, res2 = a1.align(s2)

        exp2 = pd.Series([np.nan, 1, 2], index=["A", "B", "C"])
        assert_series_equal(res2, exp2)

    def test_warning_if_not_aligned(self):
        # GH-816
        # Test that warning is issued when operating on non-aligned series

        # _series_op
        with pytest.warns(UserWarning, match="The indices .+ not equal"):
            self.a1.contains(self.a2)

        # _geo_op
        with pytest.warns(UserWarning, match="The indices .+ not equal"):
            self.a1.union(self.a2)

    def test_no_warning_if_aligned(self):
        # GH-816
        # Test that warning is not issued when operating on aligned series
        a1, a2 = self.a1.align(self.a2)

        with warnings.catch_warnings(record=True) as record:
            a1.contains(a2)  # _series_op, explicitly aligned
            self.g1.intersects(self.g2)  # _series_op, implicitly aligned
            a2.union(a1)  # _geo_op, explicitly aligned
            self.g2.intersection(self.g1)  # _geo_op, implicitly aligned

        # Only UserWarning entries count; the message of the first one is shown
        # if the assertion fails.
        user_warnings = [w for w in record if w.category is UserWarning]
        assert not user_warnings, user_warnings[0].message

    def test_geom_equals(self):
        """``geom_equals`` against the series itself and a single geometry."""
        assert np.all(self.g1.geom_equals(self.g1))
        assert_array_equal(self.g1.geom_equals(self.sq), [False, True])

    def test_geom_equals_align(self):
        """``geom_equals`` with ``align=True`` (union index) and ``align=False``."""
        a = self.a1.geom_equals(self.a2, align=True)
        exp = pd.Series([False, True, False], index=["A", "B", "C"])
        assert_series_equal(a, exp)

        a = self.a1.geom_equals(self.a2, align=False)
        exp = pd.Series([False, False], index=["A", "B"])
        assert_series_equal(a, exp)

    @pytest.mark.filterwarnings(r"ignore:The 'geom_almost_equals\(\)':FutureWarning")
    def test_geom_almost_equals(self):
        # TODO: test decimal parameter
        assert np.all(self.g1.geom_almost_equals(self.g1))
        assert_array_equal(self.g1.geom_almost_equals(self.sq), [False, True])

        # a1 and a2 have different indexes; silence the resulting UserWarning.
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                "The indices of the left and right GeoSeries' are not equal",
                UserWarning,
            )
            assert_array_equal(
                self.a1.geom_almost_equals(self.a2, align=True),
                [False, True, False],
            )
            assert_array_equal(
                self.a1.geom_almost_equals(self.a2, align=False), [False, False]
            )

    def test_geom_equals_exact(self):
        # TODO: test tolerance parameter
        assert np.all(self.g1.geom_equals_exact(self.g1, 0.001))
        assert_array_equal(self.g1.geom_equals_exact(self.sq, 0.001), [False, True])

        # a1 and a2 have different indexes; silence the resulting UserWarning.
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                "The indices of the left and right GeoSeries' are not equal",
                UserWarning,
            )
            assert_array_equal(
                self.a1.geom_equals_exact(self.a2, 0.001, align=True),
                [False, True, False],
            )
            assert_array_equal(
                self.a1.geom_equals_exact(self.a2, 0.001, align=False), [False, False]
            )

    def test_equal_comp_op(self):
        """``==`` against a single Point gives an element-wise boolean Series."""
        s = GeoSeries([Point(x, x) for x in range(3)])
        res = s == Point(1, 1)
        exp = pd.Series([False, True, False])
        assert_series_equal(res, exp)

    def test_to_file(self):
        """Test to_file and from_file"""
        tempfilename = os.path.join(self.tempdir, "test.shp")
        self.g3.to_file(tempfilename)
        # Read layer back in?
        s = GeoSeries.from_file(tempfilename)
        assert all(self.g3.geom_equals(s))
        # TODO: compare crs

    def test_to_json(self):
        """
        Test whether GeoSeries.to_json works and returns an actual json file.
        """
        json_str = self.g3.to_json()
        data = json.loads(json_str)
        assert "id" in data["features"][0].keys()
        assert "bbox" in data["features"][0].keys()
        # TODO : verify the output is a valid GeoJSON.

    def test_to_json_drop_id(self):
        """
        Test whether GeoSeries.to_json works when drop_id is True.
        """
        json_str = self.g3.to_json(drop_id=True)
        data = json.loads(json_str)
        assert "id" not in data["features"][0].keys()

    def test_to_json_no_bbox(self):
        """
        Test whether GeoSeries.to_json works when show_bbox is False.
        """
        json_str = self.g3.to_json(show_bbox=False)
        data = json.loads(json_str)
        assert "bbox" not in data["features"][0].keys()

    def test_to_json_no_bbox_drop_id(self):
        """
        Test whether GeoSeries.to_json works when show_bbox is False
        and drop_id is True.
        """
        json_str = self.g3.to_json(show_bbox=False, drop_id=True)
        data = json.loads(json_str)
        assert "id" not in data["features"][0].keys()
        assert "bbox" not in data["features"][0].keys()

    @pytest.mark.skipif(not compat.HAS_PYPROJ, reason="Requires pyproj")
    def test_to_json_wgs84(self):
        """
        Test whether the wgs84 conversion works as intended.
        """
        text = self.landmarks3857.to_json(to_wgs84=True)
        data = json.loads(text)
        assert data["type"] == "FeatureCollection"
        assert "id" in data["features"][0].keys()
        coord1 = data["features"][0]["geometry"]["coordinates"]
        coord2 = data["features"][1]["geometry"]["coordinates"]
        np.testing.assert_allclose(coord1, self.esb.coords[0])
        np.testing.assert_allclose(coord2, self.sol.coords[0])

    def test_to_json_wgs84_false(self):
        """
        Ensure no conversion to wgs84
        """
        text = self.landmarks3857.to_json()
        data = json.loads(text)
        coord1 = data["features"][0]["geometry"]["coordinates"]
        coord2 = data["features"][1]["geometry"]["coordinates"]
        assert coord1 == [-8235939.130493107, 4975301.253789809]
        assert coord2 == [-8242607.167991625, 4966620.938285081]

    def test_representative_point(self):
        """Each representative point lies inside the geometry it came from."""
        assert np.all(self.g1.contains(self.g1.representative_point()))
        assert np.all(self.g2.contains(self.g2.representative_point()))
        assert np.all(self.g3.contains(self.g3.representative_point()))
        assert np.all(self.g4.contains(self.g4.representative_point()))

    @pytest.mark.skipif(not compat.HAS_PYPROJ, reason="pyproj not available")
    def test_transform(self):
        """Round-trip ``to_crs`` and the two ``ValueError`` cases."""
        utm18n = self.landmarks.to_crs(epsg=26918)
        lonlat = utm18n.to_crs(epsg=4326)
        assert geom_almost_equals(self.landmarks, lonlat)
        # g1 was created without a crs.
        with pytest.raises(ValueError):
            self.g1.to_crs(epsg=4326)
        # Neither crs nor epsg given.
        with pytest.raises(ValueError):
            self.landmarks.to_crs(crs=None, epsg=None)

    def test_estimate_utm_crs__geographic(self):
        """UTM estimate for the lon/lat landmarks, default and "NAD83" datum."""
        pyproj = pytest.importorskip("pyproj")
        assert self.landmarks.estimate_utm_crs() == pyproj.CRS("EPSG:32618")
        assert self.landmarks.estimate_utm_crs("NAD83") == pyproj.CRS("EPSG:26918")

    def test_estimate_utm_crs__projected(self):
        """UTM estimate is the same after reprojecting the input to EPSG:3857."""
        pyproj = pytest.importorskip("pyproj")
        assert self.landmarks.to_crs("EPSG:3857").estimate_utm_crs() == pyproj.CRS(
            "EPSG:32618"
        )

    @pytest.mark.skipif(not compat.HAS_PYPROJ, reason="pyproj not available")
    def test_estimate_utm_crs__out_of_bounds(self):
        """A polygon at latitude 90 raises ``RuntimeError``."""
        with pytest.raises(RuntimeError, match="Unable to determine UTM CRS"):
            GeoSeries(
                [Polygon([(0, 90), (1, 90), (2, 90)])], crs="EPSG:4326"
            ).estimate_utm_crs()

    @pytest.mark.skipif(not compat.HAS_PYPROJ, reason="pyproj not available")
    def test_estimate_utm_crs__missing_crs(self):
        """A GeoSeries without crs raises ``RuntimeError``."""
        with pytest.raises(RuntimeError, match="crs must be set"):
            GeoSeries([Polygon([(0, 90), (1, 90), (2, 90)])]).estimate_utm_crs()

    def test_fillna(self):
        # default is to fill with empty geometry
        na = self.na_none.fillna()
        assert isinstance(na[2], BaseGeometry)
        assert na[2].is_empty
        assert geom_equals(self.na_none[:2], na[:2])
        # XXX: method works inconsistently for different pandas versions
        # self.na_none.fillna(method='backfill')

    def test_coord_slice(self):
        """Test CoordinateSlicer"""
        # need some better test cases
        assert geom_equals(self.g3, self.g3.cx[:, :])
        assert geom_equals(self.g3[[True, False]], self.g3.cx[0.9:, :0.1])
        assert geom_equals(self.g3[[False, True]], self.g3.cx[0:0.1, 0.9:1.0])

    def test_coord_slice_with_zero(self):
        # Test that CoordinateSlice correctly handles zero slice (#GH477).

        # Points (-3, -3) ... (3, 3) at index 0 ... 6; (0, 0) sits at index 3.
        gs = GeoSeries([Point(x, x) for x in range(-3, 4)])
        assert geom_equals(gs.cx[:0, :0], gs.loc[:3])
        assert geom_equals(gs.cx[:, :0], gs.loc[:3])
        assert geom_equals(gs.cx[:0, :], gs.loc[:3])
        assert geom_equals(gs.cx[0:, 0:], gs.loc[3:])
        assert geom_equals(gs.cx[0:, :], gs.loc[3:])
        assert geom_equals(gs.cx[:, 0:], gs.loc[3:])

    def test_geoseries_geointerface(self):
        """``__geo_interface__`` is a FeatureCollection with one feature per row."""
        assert self.g1.__geo_interface__["type"] == "FeatureCollection"
        assert len(self.g1.__geo_interface__["features"]) == self.g1.shape[0]

    @pytest.mark.skipif(not compat.HAS_PYPROJ, reason="pyproj not available")
    def test_proj4strings(self):
        """``to_crs`` accepts proj4 input given as a string or as a dict."""
        # As string
        reprojected = self.g3.to_crs("+proj=utm +zone=30")
        reprojected_back = reprojected.to_crs(epsg=4326)
        assert geom_almost_equals(self.g3, reprojected_back)

        # As dict
        reprojected = self.g3.to_crs({"proj": "utm", "zone": "30"})
        reprojected_back = reprojected.to_crs(epsg=4326)
        assert geom_almost_equals(self.g3, reprojected_back)

        # Set to equivalent string, convert, compare to original
        copy = self.g3.copy().set_crs("epsg:4326", allow_override=True)
        reprojected = copy.to_crs({"proj": "utm", "zone": "30"})
        reprojected_back = reprojected.to_crs(epsg=4326)
        assert geom_almost_equals(self.g3, reprojected_back)

        # Conversions by different format
        reprojected_string = self.g3.to_crs("+proj=utm +zone=30")
        reprojected_dict = self.g3.to_crs({"proj": "utm", "zone": "30"})
        assert geom_almost_equals(reprojected_string, reprojected_dict)

    def test_from_wkb(self):
        """``from_wkb`` on a list of WKB values reproduces ``g1``."""
        assert_geoseries_equal(self.g1, GeoSeries.from_wkb([self.t1.wkb, self.sq.wkb]))

    def test_from_wkb_on_invalid(self):
        """Exercise the "raise", "warn" and "ignore" values of ``on_invalid``."""
        # Single point LineString hex WKB: invalid
        invalid_wkb_hex = "01020000000100000000000000000008400000000000000840"
        message = "point array must contain 0 or >1 elements"

        with pytest.raises(Exception, match=message):
            GeoSeries.from_wkb([invalid_wkb_hex], on_invalid="raise")

        with pytest.warns(Warning, match=message):
            res = GeoSeries.from_wkb([invalid_wkb_hex], on_invalid="warn")
        assert res[0] is None

        # Any warning would be turned into an error here, so "ignore" must
        # stay silent.
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            res = GeoSeries.from_wkb([invalid_wkb_hex], on_invalid="ignore")
        assert res[0] is None

    def test_from_wkb_series(self):
        """``from_wkb`` on a pandas Series keeps that Series' index."""
        s = pd.Series([self.t1.wkb, self.sq.wkb], index=[1, 2])
        expected = self.g1.copy()
        expected.index = pd.Index([1, 2])
        assert_geoseries_equal(expected, GeoSeries.from_wkb(s))

    def test_from_wkb_series_with_index(self):
        """An explicit ``index`` selects from the input Series like ``reindex``."""
        index = [0]
        s = pd.Series([self.t1.wkb, self.sq.wkb], index=[0, 2])
        expected = self.g1.reindex(index)
        assert_geoseries_equal(expected, GeoSeries.from_wkb(s, index=index))

    def test_from_wkt(self):
        """``from_wkt`` on a list of WKT strings reproduces ``g1``."""
        assert_geoseries_equal(self.g1, GeoSeries.from_wkt([self.t1.wkt, self.sq.wkt]))

    def test_from_wkt_on_invalid(self):
        """Exercise the "raise", "warn" and "ignore" values of ``on_invalid``."""
        # Single point LineString WKT: invalid
        invalid_wkt = "LINESTRING(0 0)"
        message = "point array must contain 0 or >1 elements"

        with pytest.raises(Exception, match=message):
            GeoSeries.from_wkt([invalid_wkt], on_invalid="raise")

        with pytest.warns(Warning, match=message):
            res = GeoSeries.from_wkt([invalid_wkt], on_invalid="warn")
        assert res[0] is None

        # Any warning would be turned into an error here, so "ignore" must
        # stay silent.
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            res = GeoSeries.from_wkt([invalid_wkt], on_invalid="ignore")
        assert res[0] is None

    def test_from_wkt_series(self):
        """``from_wkt`` on a pandas Series keeps that Series' index."""
        s = pd.Series([self.t1.wkt, self.sq.wkt], index=[1, 2])
        expected = self.g1.copy()
        expected.index = pd.Index([1, 2])
        assert_geoseries_equal(expected, GeoSeries.from_wkt(s))

    def test_from_wkt_series_with_index(self):
        """An explicit ``index`` selects from the input Series like ``reindex``."""
        index = [0]
        s = pd.Series([self.t1.wkt, self.sq.wkt], index=[0, 2])
        expected = self.g1.reindex(index)
        assert_geoseries_equal(expected, GeoSeries.from_wkt(s, index=index))

    def test_to_wkb(self):
        """``to_wkb`` matches shapely's ``wkb`` / ``wkb_hex`` per geometry."""
        assert_series_equal(pd.Series([self.t1.wkb, self.sq.wkb]), self.g1.to_wkb())
        assert_series_equal(
            pd.Series([self.t1.wkb_hex, self.sq.wkb_hex]), self.g1.to_wkb(hex=True)
        )

    def test_to_wkt(self):
        """``to_wkt`` matches shapely's ``wkt`` per geometry."""
        assert_series_equal(pd.Series([self.t1.wkt, self.sq.wkt]), self.g1.to_wkt())

    def test_clip(self, naturalearth_lowres, naturalearth_cities):
        """``GeoSeries.clip`` gives the same result as the ``clip`` function."""
        left = read_file(naturalearth_cities)
        world = read_file(naturalearth_lowres)
        south_america = world[world["continent"] == "South America"]

        expected = clip(left.geometry, south_america)
        result = left.geometry.clip(south_america)
        assert_geoseries_equal(result, expected)

    def test_clip_sorting(self, naturalearth_cities, naturalearth_lowres):
        """
        Test sorting of geoseries when clipping.
        """
        cities = read_file(naturalearth_cities)
        world = read_file(naturalearth_lowres)
        south_america = world[world["continent"] == "South America"]

        unsorted_clipped_cities = clip(cities, south_america, sort=False)
        sorted_clipped_cities = clip(cities, south_america, sort=True)

        expected_sorted_index = pd.Index(
            [55, 59, 62, 88, 101, 114, 122, 169, 181, 189, 210, 230, 236, 238, 239]
        )

        # sort=False must not return an already sorted index; sort=True must.
        assert not (
            sorted(unsorted_clipped_cities.index) == unsorted_clipped_cities.index
        ).all()
        assert (
            sorted(sorted_clipped_cities.index) == sorted_clipped_cities.index
        ).all()
        assert_index_equal(expected_sorted_index, sorted_clipped_cities.index)

    def test_from_xy_points(self):
        """``from_xy`` rebuilds ``landmarks`` from arrays and from Series."""
        x = self.landmarks.x.values
        y = self.landmarks.y.values
        index = self.landmarks.index.tolist()
        crs = self.landmarks.crs
        assert_geoseries_equal(
            self.landmarks, GeoSeries.from_xy(x, y, index=index, crs=crs)
        )
        assert_geoseries_equal(
            self.landmarks,
            GeoSeries.from_xy(self.landmarks.x, self.landmarks.y, crs=crs),
        )

    def test_from_xy_points_w_z(self):
        """``from_xy`` with x, y and z Series that share a custom index."""
        index_values = [5, 6, 7]
        x = pd.Series([0, -1, 2], index=index_values)
        y = pd.Series([8, 3, 1], index=index_values)
        z = pd.Series([5, -6, 7], index=index_values)
        expected = GeoSeries(
            [Point(0, 8, 5), Point(-1, 3, -6), Point(2, 1, 7)], index=index_values
        )
        assert_geoseries_equal(expected, GeoSeries.from_xy(x, y, z))

    def test_from_xy_points_unequal_index(self):
        """``from_xy`` when x and y carry different indexes."""
        x = self.landmarks.x
        y = self.landmarks.y
        # Give y an index that differs from x's.
        y.index = -np.arange(len(y))
        crs = self.landmarks.crs
        assert_geoseries_equal(
            self.landmarks, GeoSeries.from_xy(x, y, index=x.index, crs=crs)
        )
        # Without an explicit index the expected result has a reset index.
        unindexed_landmarks = self.landmarks.copy()
        unindexed_landmarks.reset_index(inplace=True, drop=True)
        assert_geoseries_equal(
            unindexed_landmarks,
            GeoSeries.from_xy(x, y, crs=crs),
        )

    def test_from_xy_points_indexless(self):
        """``from_xy`` with plain numpy arrays for x, y and z."""
        x = np.array([0.0, 3.0])
        y = np.array([2.0, 5.0])
        z = np.array([-1.0, 4.0])
        expected = GeoSeries([Point(0, 2, -1), Point(3, 5, 4)])
        assert_geoseries_equal(expected, GeoSeries.from_xy(x, y, z))

    @pytest.mark.skipif(compat.HAS_PYPROJ, reason="pyproj installed")
    def test_set_crs_pyproj_error(self):
        """Without pyproj, ``set_crs`` raises ``ImportError``."""
        with pytest.raises(
            ImportError, match="The 'pyproj' package is required for set_crs"
        ):
            self.g1.set_crs(3857)
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_missing_values():
    """Missing values (None / NaN) versus empty geometries in a GeoSeries."""
    series = GeoSeries([Point(1, 1), None, np.nan, GeometryCollection(), Polygon()])

    # construction -> missing values get normalized to None
    assert series[1] is None
    assert series[2] is None
    assert series[3].is_empty
    assert series[4].is_empty

    # isna / is_empty
    assert series.isna().tolist() == [False, True, True, False, False]
    assert series.is_empty.tolist() == [False, False, False, True, True]
    assert series.notna().tolist() == [True, False, False, True, True]

    # fillna defaults to fill with empty geometry -> no missing values anymore
    filled = series.fillna()
    assert not filled.isna().any()

    # dropna drops the missing values
    assert not series.dropna().isna().any()
    assert len(series.dropna()) == 3
def test_isna_empty_geoseries():
    """``isna()`` on an empty GeoSeries returns an empty bool-dtype Series."""
    # ensure that isna() result for empty GeoSeries has the correct bool dtype
    empty_mask = GeoSeries([]).isna()
    assert_series_equal(empty_mask, pd.Series([], dtype="bool"))
@pytest.mark.skipif(not compat.HAS_PYPROJ, reason="pyproj not available")
def test_geoseries_crs():
    """A crs given as "IGNF:ETRS89UTM28" keeps that authority and code."""
    gs = GeoSeries().set_crs("IGNF:ETRS89UTM28")
    assert gs.crs.to_authority() == ("IGNF", "ETRS89UTM28")
@pytest.mark.skipif(not compat.HAS_PYPROJ, reason="Requires pyproj")
def test_geoseries_override_existing_crs_warning():
    """Assigning ``.crs`` on a GeoSeries that already has one warns."""
    gs = GeoSeries(crs="epsg:4326")
    with pytest.warns(
        DeprecationWarning,
        match="Overriding the CRS of a GeoSeries that already has CRS",
    ):
        gs.crs = "epsg:2100"
# -----------------------------------------------------------------------------
# Constructor tests
# -----------------------------------------------------------------------------
def check_geoseries(s):
    """Assert that ``s`` is a GeoSeries backed by geometry dtype and array.

    Checks the object itself, its ``geometry`` accessor, its ``dtype``
    (``GeometryDtype``) and its ``values`` (``GeometryArray``).
    """
    assert isinstance(s, GeoSeries)
    assert isinstance(s.geometry, GeoSeries)
    assert isinstance(s.dtype, GeometryDtype)
    assert isinstance(s.values, GeometryArray)
class TestConstructor:
def test_constructor(self):
s = GeoSeries([Point(x, x) for x in range(3)])
check_geoseries(s)
def test_single_geom_constructor(self):
p = Point(1, 2)
line = LineString([(2, 3), (4, 5), (5, 6)])
poly = Polygon(
[(0, 0), (1, 0), (1, 1), (0, 1)], [[(0.1, 0.1), (0.9, 0.1), (0.9, 0.9)]]
)
mp = MultiPoint([(1, 2), (3, 4), (5, 6)])
mline = MultiLineString([[(1, 2), (3, 4), (5, 6)], [(7, 8), (9, 10)]])
poly2 = Polygon(
[(0, 0), (0, -1), (-1, -1), (-1, 0)],
[[(-0.1, -0.1), (-0.1, -0.5), (-0.5, -0.5), (-0.5, -0.1)]],
)
mpoly = MultiPolygon([poly, poly2])
geoms = [p, line, poly, mp, mline, mpoly]
index = ["a", "b", "c", "d"]
for g in geoms:
gs = GeoSeries(g)
assert len(gs) == 1
# accessing elements no longer give identical objects
assert gs.iloc[0].equals(g)
gs = GeoSeries(g, index=index)
assert len(gs) == len(index)
for x in gs:
assert x.equals(g)
def test_non_geometry_raises(self):
with pytest.raises(TypeError, match="Non geometry data passed to GeoSeries"):
GeoSeries([True, False, True])
with pytest.raises(TypeError, match="Non geometry data passed to GeoSeries"):
GeoSeries(["a", "b", "c"])
with pytest.raises(TypeError, match="Non geometry data passed to GeoSeries"):
GeoSeries([[1, 2], [3, 4]])
def test_empty(self):
s = GeoSeries([])
check_geoseries(s)
s = GeoSeries()
check_geoseries(s)
def test_data_is_none(self):
s = GeoSeries(index=range(3))
check_geoseries(s)
def test_empty_array(self):
# with empty data that have an explicit dtype, we use the fallback or
# not depending on the dtype
# dtypes that can never hold geometry-like data
for arr in [
np.array([], dtype="bool"),
np.array([], dtype="int64"),
np.array([], dtype="float32"),
# this gets converted to object dtype by pandas
# np.array([], dtype="str"),
]:
with pytest.raises(
TypeError, match="Non geometry data passed to GeoSeries"
):
GeoSeries(arr)
# dtypes that can potentially hold geometry-like data (object) or
# can come from empty data (float64)
for arr in [
np.array([], dtype="object"),
np.array([], dtype="float64"),
np.array([], dtype="str"),
]:
with warnings.catch_warnings(record=True) as record:
s = GeoSeries(arr)
assert not record
assert isinstance(s, GeoSeries)
def test_from_series(self):
shapes = [
Polygon([(random.random(), random.random()) for _ in range(3)])
for _ in range(10)
]
s = pd.Series(shapes, index=list("abcdefghij"), name="foo")
g = GeoSeries(s)
check_geoseries(g)
assert [a.equals(b) for a, b in zip(s, g)]
assert s.name == g.name
assert s.index is g.index
@pytest.mark.skipif(not compat.HAS_PYPROJ, reason="pyproj not available")
def test_from_series_no_set_crs_on_construction(self):
# https://github.com/geopandas/geopandas/issues/2492
# also when passing Series[geometry], ensure we don't change crs of
# original data
gs = GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)])
s = pd.Series(gs)
result = GeoSeries(s, crs=4326)
assert s.values.crs is None
assert gs.crs is None
assert result.crs == "EPSG:4326"
def test_copy(self):
# default is to copy with CoW / pandas 3+
arr = np.array([Point(x, x) for x in range(3)], dtype=object)
result = GeoSeries(arr)
# modifying result doesn't change original array
result.loc[0] = Point(10, 10)
if compat.PANDAS_GE_30 or getattr(pd.options.mode, "copy_on_write", False):
assert arr[0] == Point(0, 0)
else:
assert arr[0] == Point(10, 10)
# avoid copy with copy=False
arr = np.array([Point(x, x) for x in range(3)], dtype=object)
result = GeoSeries(arr, copy=False)
assert result.array._data.flags.writeable
# now modifying result also updates original array
result.loc[0] = Point(10, 10)
assert arr[0] == Point(10, 10)
# GH 1216
@pytest.mark.parametrize("name", [None, "geometry", "Points"])
@pytest.mark.parametrize("crs", [None, "epsg:4326"])
def test_reset_index(self, name, crs):
s = GeoSeries(
[MultiPoint([(0, 0), (1, 1)]), MultiPoint([(2, 2), (3, 3), (4, 4)])],
name=name,
crs=crs,
)
s = s.explode(index_parts=True)
df = s.reset_index()
assert type(df) == GeoDataFrame
# name None -> 0, otherwise name preserved
assert df.geometry.name == (name if name is not None else 0)
assert df.crs == s.crs
@pytest.mark.parametrize("name", [None, "geometry", "Points"])
@pytest.mark.parametrize("crs", [None, "epsg:4326"])
def test_to_frame(self, name, crs):
s = GeoSeries([Point(0, 0), Point(1, 1)], name=name, crs=crs)
df = s.to_frame()
assert type(df) == GeoDataFrame
# name None -> 0, otherwise name preserved
expected_name = name if name is not None else 0
assert df.geometry.name == expected_name
assert df._geometry_column_name == expected_name
assert df.crs == s.crs
# if name is provided to to_frame, it should override
df2 = s.to_frame(name="geom")
assert type(df) == GeoDataFrame
assert df2.geometry.name == "geom"
assert df2.crs == s.crs
def test_explode_without_multiindex(self):
    """``explode(index_parts=False)`` repeats the parent label for each part."""
    multipoints = GeoSeries(
        [MultiPoint([(0, 0), (1, 1)]), MultiPoint([(2, 2), (3, 3), (4, 4)])]
    )
    exploded = multipoints.explode(index_parts=False)
    assert_index_equal(exploded.index, pd.Index([0, 0, 1, 1, 1]))
def test_explode_ignore_index(self):
    """``explode(ignore_index=True)`` yields a fresh ``0..n-1`` index."""
    multipoints = GeoSeries(
        [MultiPoint([(0, 0), (1, 1)]), MultiPoint([(2, 2), (3, 3), (4, 4)])]
    )
    exploded = multipoints.explode(ignore_index=True)
    fresh_index = pd.Index(range(len(exploded)))
    assert_index_equal(exploded.index, fresh_index)

    # index_parts is ignored if ignore_index=True
    exploded_again = exploded.explode(index_parts=True, ignore_index=True)
    assert_index_equal(exploded_again.index, fresh_index)
@@ -0,0 +1,230 @@
import warnings
import pandas as pd
from shapely.geometry import Point
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_21
import pytest
from geopandas.testing import assert_geodataframe_equal
from pandas.testing import assert_index_equal
class TestMerging:
    """Check that ``merge`` and ``pd.concat`` keep geo types and metadata."""

    def setup_method(self):
        """Build the small geo and plain pandas objects shared by the tests."""
        self.gseries = GeoSeries([Point(i, i) for i in range(3)])
        self.series = pd.Series([1, 2, 3])
        self.gdf = GeoDataFrame({"geometry": self.gseries, "values": range(3)})
        self.df = pd.DataFrame({"col1": [1, 2, 3], "col2": [0.1, 0.2, 0.3]})

    def _check_metadata(self, gdf, geometry_column_name="geometry", crs=None):
        """Assert the active geometry column name and the CRS of ``gdf``."""
        assert gdf._geometry_column_name == geometry_column_name
        assert gdf.crs == crs

    def test_merge(self):
        """``GeoDataFrame.merge`` with a DataFrame returns a GeoDataFrame."""
        res = self.gdf.merge(self.df, left_on="values", right_on="col1")

        # check result is a GeoDataFrame
        assert isinstance(res, GeoDataFrame)

        # check geometry property gives GeoSeries
        assert isinstance(res.geometry, GeoSeries)

        # check metadata
        self._check_metadata(res)

        # test that crs and other geometry name are preserved
        self.gdf.crs = "epsg:4326"
        self.gdf = self.gdf.rename(columns={"geometry": "points"}).set_geometry(
            "points"
        )
        res = self.gdf.merge(self.df, left_on="values", right_on="col1")
        assert isinstance(res, GeoDataFrame)
        assert isinstance(res.geometry, GeoSeries)
        self._check_metadata(res, "points", self.gdf.crs)

    def test_concat_axis0(self):
        """Row-wise concat of geo frames / series keeps the geo type."""
        # frame
        res = pd.concat([self.gdf, self.gdf])
        assert res.shape == (6, 2)
        assert isinstance(res, GeoDataFrame)
        assert isinstance(res.geometry, GeoSeries)
        self._check_metadata(res)
        exp = GeoDataFrame(pd.concat([pd.DataFrame(self.gdf), pd.DataFrame(self.gdf)]))
        assert_geodataframe_equal(exp, res)

        # series
        res = pd.concat([self.gdf.geometry, self.gdf.geometry])
        assert res.shape == (6,)
        assert isinstance(res, GeoSeries)
        assert isinstance(res.geometry, GeoSeries)

    @pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not available")
    def test_concat_axis0_crs(self):
        """Row-wise concat resolves, warns about, or rejects input CRS mixes."""
        # CRS not set for both GeoDataFrame
        res = pd.concat([self.gdf, self.gdf])
        self._check_metadata(res)

        # CRS set for both GeoDataFrame, same CRS
        res1 = pd.concat([self.gdf.set_crs("epsg:4326"), self.gdf.set_crs("epsg:4326")])
        self._check_metadata(res1, crs="epsg:4326")

        # CRS not set for one GeoDataFrame, but set for the other GeoDataFrame
        with pytest.warns(
            UserWarning, match=r"CRS not set for some of the concatenation inputs.*"
        ):
            res2 = pd.concat([self.gdf, self.gdf.set_crs("epsg:4326")])
            self._check_metadata(res2, crs="epsg:4326")

        # CRS set for both GeoDataFrame, different CRS
        with pytest.raises(
            ValueError, match=r"Cannot determine common CRS for concatenation inputs.*"
        ):
            pd.concat([self.gdf.set_crs("epsg:4326"), self.gdf.set_crs("epsg:4327")])

        # CRS not set for one GeoDataFrame, but set for the other GeoDataFrames,
        # same CRS
        with pytest.warns(
            UserWarning, match=r"CRS not set for some of the concatenation inputs.*"
        ):
            res3 = pd.concat(
                [self.gdf, self.gdf.set_crs("epsg:4326"), self.gdf.set_crs("epsg:4326")]
            )
            self._check_metadata(res3, crs="epsg:4326")

        # CRS not set for one GeoDataFrame, but set for the other GeoDataFrames,
        # different CRS
        with pytest.raises(
            ValueError, match=r"Cannot determine common CRS for concatenation inputs.*"
        ):
            pd.concat(
                [self.gdf, self.gdf.set_crs("epsg:4326"), self.gdf.set_crs("epsg:4327")]
            )

    @pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not available")
    def test_concat_axis0_unaligned_cols(self):
        """Concat of frames with different geometry columns must not warn."""
        # https://github.com/geopandas/geopandas/issues/2679
        gdf = self.gdf.set_crs("epsg:4326").assign(
            geom=self.gdf.geometry.set_crs("epsg:4327")
        )
        both_geom_cols = gdf[["geom", "geometry"]]
        single_geom_col = gdf[["geometry"]]
        # turn any warning into an error so an unexpected one fails the test
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            pd.concat([both_geom_cols, single_geom_col])

        # Check order of mismatch doesn't matter
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            pd.concat([single_geom_col, both_geom_cols])

        # Side effect of this fix, explicitly provided all none geoseries
        # will not be warned for (ideally this would still warn)
        explicit_all_none_case = gdf[["geometry"]].assign(
            geom=GeoSeries([None for _ in range(len(gdf))])
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            pd.concat([both_geom_cols, explicit_all_none_case])

        # Check concat with partially None col is not affected by the special casing
        # for all None no CRS handling
        with pytest.warns(
            UserWarning, match=r"CRS not set for some of the concatenation inputs.*"
        ):
            partial_none_case = self.gdf[["geometry"]]
            partial_none_case.iloc[0] = None
            pd.concat([single_geom_col, partial_none_case])

    def test_concat_axis0_crs_wkt_mismatch(self):
        """CRS objects that compare equal but hash differently still concat."""
        pyproj = pytest.importorskip("pyproj")

        # https://github.com/geopandas/geopandas/issues/326#issuecomment-1727958475
        wkt_template = """GEOGCRS["WGS 84",
ENSEMBLE["World Geodetic System 1984 ensemble",
MEMBER["World Geodetic System 1984 (Transit)"],
MEMBER["World Geodetic System 1984 (G730)"],
MEMBER["World Geodetic System 1984 (G873)"],
MEMBER["World Geodetic System 1984 (G1150)"],
MEMBER["World Geodetic System 1984 (G1674)"],
MEMBER["World Geodetic System 1984 (G1762)"],
MEMBER["World Geodetic System 1984 (G2139)"],
ELLIPSOID["WGS 84",6378137,298.257223563,LENGTHUNIT["metre",1]],
ENSEMBLEACCURACY[2.0]],PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],CS[ellipsoidal,2],
AXIS["geodetic latitude (Lat)",north,ORDER[1],
ANGLEUNIT["degree",0.0174532925199433]],
AXIS["geodetic longitude (Lon)",east,ORDER[2],
ANGLEUNIT["degree",0.0174532925199433]],
USAGE[SCOPE["Horizontal component of 3D system."],
AREA["World.{}"],BBOX[-90,-180,90,180]],ID["EPSG",4326]]"""
        wkt_v1 = wkt_template.format("")
        wkt_v2 = wkt_template.format(" ")  # add additional whitespace
        crs1 = pyproj.CRS.from_wkt(wkt_v1)
        crs2 = pyproj.CRS.from_wkt(wkt_v2)
        # pyproj CRS __hash__ is based on the WKT string, so these two are
        # distinct members of a set even though they compare equal
        assert len({crs1, crs2}) == 2
        assert crs1 == crs2
        expected = pd.concat([self.gdf, self.gdf]).set_crs(crs1)
        res = pd.concat([self.gdf.set_crs(crs1), self.gdf.set_crs(crs2)])
        assert_geodataframe_equal(expected, res)

    def test_concat_axis1(self):
        """Column-wise concat of a GeoDataFrame and a DataFrame stays geo."""
        res = pd.concat([self.gdf, self.df], axis=1)

        assert res.shape == (3, 4)
        assert isinstance(res, GeoDataFrame)
        assert isinstance(res.geometry, GeoSeries)
        self._check_metadata(res)

    def test_concat_axis1_multiple_geodataframes(self):
        """Column-wise concat with duplicated geometry column names raises."""
        # https://github.com/geopandas/geopandas/issues/1230
        # Expect that concat should fail gracefully if duplicate column names belonging
        # to geometry columns are introduced.
        if PANDAS_GE_21:
            # _constructor_from_mgr changes mean we now get the concat specific error
            # message in this case too
            expected_err = (
                "Concat operation has resulted in multiple columns using the geometry "
                "column name 'geometry'."
            )
        else:
            expected_err = (
                "GeoDataFrame does not support multiple columns using the geometry"
                " column name 'geometry'"
            )
        with pytest.raises(ValueError, match=expected_err):
            pd.concat([self.gdf, self.gdf], axis=1)

        # Check case is handled if custom geometry column name is used
        df2 = self.gdf.rename_geometry("geom")
        expected_err2 = (
            "Concat operation has resulted in multiple columns using the geometry "
            "column name 'geom'."
        )
        with pytest.raises(ValueError, match=expected_err2):
            pd.concat([df2, df2], axis=1)

        if HAS_PYPROJ:
            # Check that two geometry columns is fine, if they have different names
            res3 = pd.concat([df2.set_crs("epsg:4326"), self.gdf], axis=1)
            # check metadata comes from first df
            self._check_metadata(res3, geometry_column_name="geom", crs="epsg:4326")

    @pytest.mark.filterwarnings("ignore:Accessing CRS")
    def test_concat_axis1_geoseries(self):
        """Column-wise concat of GeoSeries gives a frame with no active geometry."""
        gseries2 = GeoSeries([Point(i, i) for i in range(3, 6)], crs="epsg:4326")
        result = pd.concat([gseries2, self.gseries], axis=1)
        # Note this is not consistent with concat([gdf, gdf], axis=1) where the
        # left metadata is set on the result. This is deliberate for now.
        assert type(result) is GeoDataFrame
        assert result._geometry_column_name is None
        assert_index_equal(pd.Index([0, 1]), result.columns)

        gseries2.name = "foo"
        result2 = pd.concat([gseries2, self.gseries], axis=1)
        assert type(result2) is GeoDataFrame
        # check the named-series result (previously re-checked ``result``)
        assert result2._geometry_column_name is None
        assert_index_equal(pd.Index(["foo", 0]), result2.columns)
@@ -0,0 +1,411 @@
import numpy as np
import pandas as pd
from shapely.geometry import Point
import geopandas
from geopandas import GeoDataFrame, GeoSeries
import pytest
from geopandas.testing import assert_geodataframe_equal
# Everything below builds pyproj CRS objects, so the module is skipped at
# import time when pyproj is not installed.
pyproj = pytest.importorskip("pyproj")
# Two different CRS objects (EPSG:27700 and EPSG:4326): the ``df`` fixture gives
# its active geometry column ``crs_wgs`` and its secondary column ``crs_osgb``.
crs_osgb = pyproj.CRS(27700)
crs_wgs = pyproj.CRS(4326)
# Number of rows generated by the ``df`` fixture.
N = 10
@pytest.fixture(params=["geometry", "point"])
def df(request):
    """GeoDataFrame with two value columns and two geometry columns.

    The active geometry column is named after the fixture parameter and uses
    ``crs_wgs``; ``geometry2`` holds the same points tagged with ``crs_osgb``.
    """
    geo_name = request.param
    rows = [
        {
            "value1": x + y,
            "value2": x * y,
            geo_name: Point(x, y),  # rename this col in tests
        }
        for x, y in zip(range(N), range(N))
    ]
    frame = GeoDataFrame(rows, crs=crs_wgs, geometry=geo_name)
    # want geometry2 to be a GeoSeries not Series, test behaviour of non geom col
    secondary = frame[geo_name].set_crs(crs_osgb, allow_override=True)
    frame["geometry2"] = secondary
    return frame
@pytest.fixture
def df2():
    """For constructor_sliced tests"""

    def diagonal_points():
        return GeoSeries([Point(x, x) for x in range(3)])

    columns = {
        name: diagonal_points() for name in ("geometry", "geometry2", "geometry3")
    }
    columns["value"] = [1, 2, 1]
    columns["value_nan"] = np.nan
    return GeoDataFrame(columns)
def _check_metadata_gdf(gdf, geo_name="geometry", crs=crs_wgs):
    """Assert the active geometry column name and CRS of a GeoDataFrame."""
    stored_name = gdf._geometry_column_name
    assert stored_name == geo_name
    active_geometry = gdf.geometry
    assert active_geometry.name == geo_name
    assert gdf.crs == crs
def _check_metadata_gs(gs, name="geometry", crs=crs_wgs):
    """Assert the name and CRS of a GeoSeries."""
    series_name = gs.name
    assert series_name == name
    series_crs = gs.crs
    assert series_crs == crs
def assert_object(result, expected_type, geo_name="geometry", crs=crs_wgs):
    """
    Assert that ``result`` has exactly the type ``expected_type``.

    For a GeoDataFrame, additionally check that ``geo_name`` is not None and
    that the active geometry column name and CRS match. For a GeoSeries,
    check that the series name and CRS match. Other types get the type
    check only.
    """
    assert type(result) is expected_type

    if expected_type == GeoDataFrame:
        assert geo_name is not None
        _check_metadata_gdf(result, geo_name=geo_name, crs=crs)
        return

    if expected_type == GeoSeries:
        _check_metadata_gs(result, name=geo_name, crs=crs)
def assert_obj_no_active_geo_col(result, expected_type, geo_colname=None):
    """
    Assert that ``result`` has no usable active geometry column.

    Checks that ``result._geometry_column_name`` equals ``geo_colname`` (and is
    None when ``geo_colname`` is None), then that accessing
    ``result.geometry.name`` raises an AttributeError whose message matches the
    state: geometry column unset, or the recorded column no longer present.

    Only ``expected_type == GeoDataFrame`` is supported; anything else raises
    NotImplementedError.
    """
    if expected_type != GeoDataFrame:
        raise NotImplementedError()

    stored_name = result._geometry_column_name
    if geo_colname is None:
        assert stored_name is None
    else:
        assert geo_colname == stored_name

    if stored_name is None:
        msg = (
            "You are calling a geospatial method on the GeoDataFrame, "
            "but the active"
        )
    else:
        msg = (
            "You are calling a geospatial method on the GeoDataFrame, but "
            r"the active geometry column \("
            rf"'{stored_name}'\) is not present"
        )
    with pytest.raises(AttributeError, match=msg):
        result.geometry.name  # be explicit that geometry is invalid here
def test_getitem(df):
    """Check the result types of ``df[...]`` for column lists and single columns."""
    geo_name = df.geometry.name

    # column-list selections
    assert_object(df[["value1", "value2"]], pd.DataFrame)
    for columns in ([geo_name, "geometry2"], [geo_name]):
        assert_object(df[columns], GeoDataFrame, geo_name)
    for columns in (["geometry2", "value1"], ["geometry2"]):
        assert_obj_no_active_geo_col(df[columns], GeoDataFrame, geo_name)
    assert_object(df[["value1"]], pd.DataFrame)

    # Series
    assert_object(df[geo_name], GeoSeries, geo_name)
    assert_object(df["geometry2"], GeoSeries, "geometry2", crs=crs_osgb)
    assert_object(df["value1"], pd.Series)
def test_loc(df):
    """Check the result types of ``df.loc[:, ...]`` column selection."""
    geo_name = df.geometry.name

    # column-list selections
    assert_object(df.loc[:, ["value1", "value2"]], pd.DataFrame)
    for columns in ([geo_name, "geometry2"], [geo_name]):
        assert_object(df.loc[:, columns], GeoDataFrame, geo_name)
    for columns in (["geometry2", "value1"], ["geometry2"]):
        assert_obj_no_active_geo_col(df.loc[:, columns], GeoDataFrame, geo_name)
    assert_object(df.loc[:, ["value1"]], pd.DataFrame)

    # Series
    assert_object(df.loc[:, geo_name], GeoSeries, geo_name)
    assert_object(df.loc[:, "geometry2"], GeoSeries, "geometry2", crs=crs_osgb)
    assert_object(df.loc[:, "value1"], pd.Series)
@pytest.mark.parametrize(
    "geom_name",
    [
        "geometry",
        pytest.param(
            "geom",
            marks=pytest.mark.xfail(
                reason="pre-regression behaviour only works for geometry col geometry"
            ),
        ),
    ],
)
def test_loc_add_row(geom_name, nybb_filename):
    """Adding a row through ``.loc`` keeps the geometry dtype."""
    # https://github.com/geopandas/geopandas/issues/3119
    boroughs = geopandas.read_file(nybb_filename)[["BoroCode", "geometry"]]
    if geom_name != "geometry":
        boroughs = boroughs.rename_geometry(geom_name)
    # crs_orig = boroughs.crs

    # add a new row
    first_geometry = boroughs.geometry.iloc[0]
    boroughs.loc[5] = [6, first_geometry]
    assert boroughs.geometry.dtype == "geometry"
    assert boroughs.crs is None  # TODO this should be crs_orig, regressed in #2373
def test_iloc(df):
    """Check the result types of positional column selection via ``df.iloc``."""
    geo_name = df.geometry.name

    # DataFrame-like selections
    assert_object(df.iloc[:, 0:2], pd.DataFrame)
    for positions in (slice(2, 4), [2]):
        assert_object(df.iloc[:, positions], GeoDataFrame, geo_name)
    for positions in ([3, 0], [3]):
        assert_obj_no_active_geo_col(df.iloc[:, positions], GeoDataFrame, geo_name)
    assert_object(df.iloc[:, [0]], pd.DataFrame)

    # Series
    assert_object(df.iloc[:, 2], GeoSeries, geo_name)
    assert_object(df.iloc[:, 3], GeoSeries, "geometry2", crs=crs_osgb)
    assert_object(df.iloc[:, 0], pd.Series)
def test_squeeze(df):
    """Squeezing a single geometry column frame gives a GeoSeries."""
    geo_name = df.geometry.name
    active = df[[geo_name]].squeeze()
    assert_object(active, GeoSeries, geo_name)
    secondary = df[["geometry2"]].squeeze()
    assert_object(secondary, GeoSeries, "geometry2", crs=crs_osgb)
def test_to_frame(df):
    """``to_frame`` on geometry columns gives a GeoDataFrame, else a DataFrame."""
    geo_name = df.geometry.name

    active_frame = df[geo_name].to_frame()
    assert_object(active_frame, GeoDataFrame, geo_name, crs=df[geo_name].crs)

    secondary_frame = df["geometry2"].to_frame()
    assert_object(secondary_frame, GeoDataFrame, "geometry2", crs=crs_osgb)

    value_frame = df["value1"].to_frame()
    assert_object(value_frame, pd.DataFrame)
def test_reindex(df):
    """Check the result types of ``reindex`` over columns, rows, and both."""
    geo_name = df.geometry.name
    row_labels = [0, 1, 20]

    # reindexing the columns
    assert_object(df.reindex(columns=["value1", "value2"]), pd.DataFrame)
    for columns in ([geo_name, "geometry2"], [geo_name], ["new_col", geo_name]):
        assert_object(df.reindex(columns=columns), GeoDataFrame, geo_name)
    for columns in (["geometry2", "value1"], ["geometry2"]):
        assert_obj_no_active_geo_col(
            df.reindex(columns=columns), GeoDataFrame, geo_name
        )
    assert_object(df.reindex(columns=["value1"]), pd.DataFrame)

    # reindexing the rows always preserves the GeoDataFrame
    assert_object(df.reindex(index=row_labels), GeoDataFrame, geo_name)

    # reindexing both rows and columns
    assert_object(
        df.reindex(index=row_labels, columns=[geo_name]), GeoDataFrame, geo_name
    )
    assert_object(df.reindex(index=row_labels, columns=["value1"]), pd.DataFrame)
def test_drop(df):
    """Check the result types of ``drop(columns=...)``."""
    geo_name = df.geometry.name

    assert_object(df.drop(columns=[geo_name, "geometry2"]), pd.DataFrame)
    for dropped in (["value1", "value2"], ["value1", "value2", "geometry2"]):
        assert_object(df.drop(columns=dropped), GeoDataFrame, geo_name)
    for dropped in ([geo_name, "value2"], ["value1", "value2", geo_name]):
        assert_obj_no_active_geo_col(
            df.drop(columns=dropped), GeoDataFrame, geo_name
        )
    assert_object(df.drop(columns=["geometry2", "value2", geo_name]), pd.DataFrame)
def test_apply(df):
    """Check the result types of ``apply`` on frames (both axes) and on series."""
    geo_name = df.geometry.name

    def passthrough(obj):
        return obj

    def as_text(geom):
        return str(geom)

    # axis = 0
    assert_object(df[["value1", "value2"]].apply(passthrough), pd.DataFrame)
    for columns in ([geo_name, "geometry2"], [geo_name]):
        assert_object(df[columns].apply(passthrough), GeoDataFrame, geo_name)
    for columns in (["geometry2", "value1"], ["geometry2"]):
        applied = df[columns].apply(passthrough)
        assert_obj_no_active_geo_col(applied, GeoDataFrame, geo_name)
    assert_object(df[["value1"]].apply(passthrough), pd.DataFrame)

    # axis = 0, Series
    assert_object(df[geo_name].apply(passthrough), GeoSeries, geo_name)
    assert_object(
        df["geometry2"].apply(passthrough), GeoSeries, "geometry2", crs=crs_osgb
    )
    assert_object(df["value1"].apply(passthrough), pd.Series)

    # axis = 0, Series, no longer geometry
    for column in (geo_name, "geometry2"):
        assert_object(df[column].apply(as_text), pd.Series)

    # axis = 1
    assert_object(df[["value1", "value2"]].apply(passthrough, axis=1), pd.DataFrame)
    for columns in ([geo_name, "geometry2"], [geo_name]):
        assert_object(df[columns].apply(passthrough, axis=1), GeoDataFrame, geo_name)
    # TODO below should be a GeoDataFrame to be consistent with new getitem logic
    # leave as follow up as quite complicated
    # FrameColumnApply.series_generator returns object dtypes Series, so will have
    # patch result of apply
    for columns in (["geometry2", "value1"], ["value1"]):
        assert_object(df[columns].apply(passthrough, axis=1), pd.DataFrame)
def test_apply_axis1_secondary_geo_cols(df):
    """Row-wise ``apply`` on only the secondary geometry column stays a GeoDataFrame."""
    geo_name = df.geometry.name

    def passthrough(row):
        return row

    applied = df[["geometry2"]].apply(passthrough, axis=1)
    assert_obj_no_active_geo_col(applied, GeoDataFrame, geo_name)
def test_expanddim_in_apply():
    """A GeoSeries ``apply`` that returns Series of numbers gives a DataFrame."""
    # https://github.com/geopandas/geopandas/pull/2296#issuecomment-1021966443
    points = GeoSeries.from_xy([0, 1], [0, 1])

    def coordinates(geom):
        return pd.Series([geom.x, geom.y])

    expanded = points.apply(coordinates)
    assert_object(expanded, pd.DataFrame)
def test_expandim_in_groupby_aggregate_multiple_funcs():
    """Check the frame type returned by ``groupby().agg([...])`` on a GeoSeries."""
    # https://github.com/geopandas/geopandas/pull/2296#issuecomment-1021966443
    # There are two calls to _constructor_expanddim here
    # SeriesGroupBy._aggregate_multiple_funcs() and
    # SeriesGroupBy._wrap_series_output() len(output) > 1
    points = GeoSeries.from_xy([0, 1, 2], [0, 1, 3])

    def union(group):
        return group.union_all()

    def total_area(group):
        return group.area.sum()

    grouped = points.groupby([0, 1, 0])

    # a geometry-valued aggregation gives a GeoDataFrame without active geometry
    for funcs in ([total_area, union], [union, total_area]):
        aggregated = grouped.agg(funcs)
        assert_obj_no_active_geo_col(aggregated, GeoDataFrame, geo_colname=None)

    # purely numeric aggregations give a plain DataFrame
    for funcs in ([total_area, total_area], [total_area]):
        assert_object(grouped.agg(funcs), pd.DataFrame)
def test_expanddim_in_unstack():
    """``unstack`` on a GeoSeries gives a GeoDataFrame with no active geometry."""
    # https://github.com/geopandas/geopandas/pull/2296#issuecomment-1021966443
    levels = pd.MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("B", "a")])
    points = GeoSeries.from_xy([0, 1, 2], [0, 1, 3], index=levels)
    expected_geo_name = None

    unstacked = points.unstack()
    assert_obj_no_active_geo_col(
        unstacked, GeoDataFrame, geo_colname=expected_geo_name
    )

    # https://github.com/geopandas/geopandas/issues/2486
    points.name = "geometry"
    unstacked_named = points.unstack()
    assert_obj_no_active_geo_col(unstacked_named, GeoDataFrame, expected_geo_name)
# indexing / constructor_sliced tests
# Column subsets of the ``df2`` fixture used to parametrize
# ``test_constructor_sliced_row_slices``; each inner list also becomes the
# test id (joined with ", ").
test_case_column_sets = [
["geometry"],
["geometry2"],
["geometry", "geometry2"],
# non active geo col case
["geometry", "value"],
["geometry", "value_nan"],
["geometry2", "value"],
["geometry2", "value_nan"],
]
@pytest.mark.parametrize(
    "column_set",
    test_case_column_sets,
    ids=list(map(", ".join, test_case_column_sets)),
)
def test_constructor_sliced_row_slices(df2, column_set):
    """Selecting one row of a GeoDataFrame gives a plain ``pd.Series``."""
    # https://github.com/geopandas/geopandas/issues/2282
    subset = df2[column_set]
    assert isinstance(subset, GeoDataFrame)

    row = subset.loc[0]
    # row slices shouldn't be GeoSeries, even if they have a geometry col
    assert type(row) is pd.Series
    if "geometry" in column_set:
        assert not isinstance(row.geometry, pd.Series)
        assert row.geometry == Point(0, 0)
def test_constructor_sliced_column_slices(df2):
    """Positional column slices of geometry type are GeoSeries; rows are not."""
    # Note loc doesn't use _constructor_sliced so it's not tested here
    geo_idx = df2.columns.get_loc("geometry")

    # column slices should be GeoSeries if of geometry type
    for n_rows in (1, 2):
        head = df2.head(n_rows)
        assert type(head.iloc[:, geo_idx]) is GeoSeries
        assert type(head.iloc[list(range(n_rows)), geo_idx]) is GeoSeries

    # check iloc row slices are pd.Series instead
    assert type(df2.iloc[0, :]) is pd.Series
def test_constructor_sliced_in_pandas_methods(df2):
    """Non-geometry reductions on a GeoDataFrame return plain ``pd.Series``."""
    # constructor sliced is used in many places, checking a sample of non
    # geometry cases are sensible
    counts = df2.count()
    assert type(counts) is pd.Series

    # drop the secondary geometry columns as not hashable
    hashable_test_df = df2.drop(columns=["geometry2", "geometry3"])
    duplicated = hashable_test_df.duplicated()
    assert type(duplicated) is pd.Series

    quantiles = df2.quantile(numeric_only=True)
    assert type(quantiles) is pd.Series

    usage = df2.memory_usage()
    assert type(usage) is pd.Series
def test_merge_preserve_geodataframe():
    """A self-merge that renames the geometry column still returns a GeoDataFrame."""
    # https://github.com/geopandas/geopandas/issues/2932
    point = GeoSeries.from_xy([1], [1])
    frame = GeoDataFrame({"geo": point})

    merged = frame.merge(frame, left_index=True, right_index=True)
    assert_obj_no_active_geo_col(merged, GeoDataFrame, geo_colname=None)

    expected = GeoDataFrame({"geo_x": point, "geo_y": point})
    assert_geodataframe_equal(expected, merged)
@@ -0,0 +1,891 @@
import os
import numpy as np
import pandas as pd
from shapely import make_valid
from shapely.geometry import GeometryCollection, LineString, Point, Polygon, box
import geopandas
from geopandas import GeoDataFrame, GeoSeries, overlay, read_file
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
# Use fiona's DriverError when fiona is installed; otherwise define a local
# exception of the same name so the module still imports.
try:
    from fiona.errors import DriverError
except ImportError:

    class DriverError(Exception):
        """Placeholder used when fiona is not installed."""
# Absolute path of the "data/overlay" directory next to this test module;
# the tests below read their input and expected files from it.
DATA = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data", "overlay")
@pytest.fixture
def dfs(request):
    """Two GeoDataFrames of 2x2 squares; the second is offset by (1, 1)."""

    def square(x, y):
        # 2x2 square with its lower-left corner at (x, y)
        return Polygon([(x, y), (x + 2, y), (x + 2, y + 2), (x, y + 2)])

    left_geoms = GeoSeries([square(0, 0), square(2, 2)])
    right_geoms = GeoSeries([square(1, 1), square(3, 3)])
    left = GeoDataFrame({"col1": [1, 2], "geometry": left_geoms})
    right = GeoDataFrame({"col2": [1, 2], "geometry": right_geoms})
    return left, right
@pytest.fixture(params=["default-index", "int-index", "string-index"])
def dfs_index(request, dfs):
    """The ``dfs`` pair with default, integer, or string row labels."""
    left, right = dfs
    index_kind = request.param
    if index_kind == "int-index":
        left.index, right.index = [1, 2], [0, 2]
    elif index_kind == "string-index":
        left.index = ["row1", "row2"]
    return left, right
# Parametrized fixture: runs each requesting test once per value in ``params``,
# which the tests pass on as ``overlay(..., how=how)``.
@pytest.fixture(
params=["union", "intersection", "difference", "symmetric_difference", "identity"]
)
def how(request):
return request.param
# Parametrized fixture: runs each requesting test with both True and False.
@pytest.fixture(params=[True, False])
def keep_geom_type(request):
return request.param
def test_overlay(dfs_index, how):
    """
    Basic overlay test with small dummy example dataframes (from docs).
    Results obtained using QGIS 2.16 (Vector -> Geoprocessing Tools ->
    Intersection / Union / ...), saved to GeoJSON
    """
    df1, df2 = dfs_index
    result = overlay(df1, df2, how=how)

    # construction of result

    def load_expected(name):
        path = os.path.join(DATA, "polys", f"df1_df2-{name}.geojson")
        frame = read_file(path)
        frame.geometry.array.crs = None
        int32_columns = frame.columns[frame.dtypes == "int32"]
        for column in int32_columns:
            frame[column] = frame[column].astype("int64")
        return frame

    if how != "identity":
        expected = load_expected(how)
    else:
        intersection_part = load_expected("intersection")
        difference_part = load_expected("difference")
        expected = pd.concat(
            [intersection_part, difference_part], ignore_index=True, sort=False
        )
        expected["col1"] = expected["col1"].astype(float)

    # TODO needed adaptations to result
    if how == "union":
        result = result.sort_values(["col1", "col2"]).reset_index(drop=True)
    elif how == "difference":
        result = result.reset_index(drop=True)

    assert_geodataframe_equal(result, expected, check_column_type=False)

    # for difference also reversed
    if how == "difference":
        reversed_result = overlay(df2, df1, how=how).reset_index(drop=True)
        reversed_expected = load_expected("difference-inverse")
        assert_geodataframe_equal(
            reversed_result, reversed_expected, check_column_type=False
        )
@pytest.mark.filterwarnings("ignore:GeoSeries crs mismatch:UserWarning")
def test_overlay_nybb(how, nybb_filename):
    """Compare ``overlay`` of the nybb polygons with saved QGIS results."""
    boroughs = read_file(nybb_filename)

    # The circles have been constructed and saved at the time the expected
    # results were created (exact output of buffer algorithm can slightly
    # change over time -> use saved ones)
    # # construct circles dataframe
    # N = 10
    # b = [int(x) for x in polydf.total_bounds]
    # polydf2 = GeoDataFrame(
    #     [
    #         {"geometry": Point(x, y).buffer(10000), "value1": x + y, "value2": x - y}
    #         for x, y in zip(
    #             range(b[0], b[2], int((b[2] - b[0]) / N)),
    #             range(b[1], b[3], int((b[3] - b[1]) / N)),
    #         )
    #     ],
    #     crs=polydf.crs,
    # )
    circles = read_file(os.path.join(DATA, "nybb_qgis", "polydf2.shp"))

    result = overlay(boroughs, circles, how=how)

    cols = ["BoroCode", "BoroName", "Shape_Leng", "Shape_Area", "value1", "value2"]
    if how == "difference":
        cols = cols[:-2]

    # expected result

    # for identity, read union one, further down below we take the
    # appropriate subset
    expected_source = "union" if how == "identity" else how
    expected = read_file(
        os.path.join(DATA, "nybb_qgis", f"qgis-{expected_source}.shp")
    )

    # The result of QGIS for 'union' contains incorrect geometries:
    # 24 is a full original circle overlapping with unioned geometries, and
    # 27 is a completely duplicated row)
    if how == "union":
        expected = expected.drop([24, 27])
        expected.reset_index(inplace=True, drop=True)
    # Eliminate observations without geometries (issue from QGIS)
    expected = expected[expected.is_valid]
    expected.reset_index(inplace=True, drop=True)

    if how == "identity":
        expected = expected[expected.BoroCode.notnull()].copy()

    # Order GeoDataFrames
    expected = expected.sort_values(cols).reset_index(drop=True)

    # TODO needed adaptations to result
    result = result.sort_values(cols).reset_index(drop=True)

    if how in ("union", "identity"):
        # concat < 0.23 sorts, so changes the order of the columns
        # but at least we ensure 'geometry' is the last column
        assert result.columns[-1] == "geometry"
        assert len(result.columns) == len(expected.columns)
        result = result.reindex(columns=expected.columns)

    # the ordering of the spatial index results causes slight deviations
    # in the resultant geometries for multipolygons
    # for more details on the discussion, see:
    # https://github.com/geopandas/geopandas/pull/1338
    # https://github.com/geopandas/geopandas/issues/1337

    # Temporary workaround below:

    # simplify multipolygon geometry comparison
    # since the order of the constituent polygons depends on
    # the ordering of spatial indexing results, we cannot
    # compare symmetric_difference results directly when the
    # resultant geometry is a multipolygon

    # first, check that all bounds and areas are approx equal
    # this is a very rough check for multipolygon equality
    pd.testing.assert_series_equal(result.geometry.area, expected.geometry.area)
    pd.testing.assert_frame_equal(result.geometry.bounds, expected.geometry.bounds)

    # There are two cases where the multipolygon have a different number
    # of sub-geometries -> not solved by normalize (and thus drop for now)
    row_to_blank = {"symmetric_difference": 9, "union": 24}.get(how)
    if row_to_blank is not None:
        expected.loc[row_to_blank, "geometry"] = None
        result.loc[row_to_blank, "geometry"] = None

    # missing values get read as None in read_file for a string column, but
    # are introduced as NaN by overlay
    expected["BoroName"] = expected["BoroName"].fillna(np.nan)

    assert_geodataframe_equal(
        result,
        expected,
        normalize=True,
        check_crs=False,
        check_column_type=False,
        check_less_precise=True,
    )
def test_overlay_overlap(how):
    """
    Overlay test with overlapping geometries in both dataframes.
    Test files are created with::

        import geopandas
        from geopandas import GeoSeries, GeoDataFrame
        from shapely.geometry import Point, Polygon, LineString

        s1 = GeoSeries([Point(0, 0), Point(1.5, 0)]).buffer(1, resolution=2)
        s2 = GeoSeries([Point(1, 1), Point(2, 2)]).buffer(1, resolution=2)

        df1 = GeoDataFrame({'geometry': s1, 'col1':[1,2]})
        df2 = GeoDataFrame({'geometry': s2, 'col2':[1, 2]})

        ax = df1.plot(alpha=0.5)
        df2.plot(alpha=0.5, ax=ax, color='C1')

        df1.to_file('geopandas/geopandas/tests/data/df1_overlap.geojson',
                    driver='GeoJSON')
        df2.to_file('geopandas/geopandas/tests/data/df2_overlap.geojson',
                    driver='GeoJSON')

    and then overlay results are obtained from using QGIS 2.16
    (Vector -> Geoprocessing Tools -> Intersection / Union / ...),
    saved to GeoJSON.
    """
    overlap_dir = os.path.join(DATA, "overlap")
    df1 = read_file(os.path.join(overlap_dir, "df1_overlap.geojson"))
    df2 = read_file(os.path.join(overlap_dir, "df2_overlap.geojson"))

    result = overlay(df1, df2, how=how)

    if how == "identity":
        raise pytest.skip()

    expected = read_file(
        os.path.join(overlap_dir, f"df1_df2_overlap-{how}.geojson")
    )
    is_union = how == "union"

    if is_union:
        # the QGIS result has the last row duplicated, so removing this
        expected = expected.iloc[:-1]

    # TODO needed adaptations to result
    result = result.reset_index(drop=True)
    if is_union:
        result = result.sort_values(["col1", "col2"]).reset_index(drop=True)

    assert_geodataframe_equal(
        result,
        expected,
        normalize=True,
        check_column_type=False,
        check_less_precise=True,
    )
@pytest.mark.parametrize("other_geometry", [False, True])
def test_geometry_not_named_geometry(dfs, how, other_geometry):
    """Overlay works when the active geometry column is not named "geometry"."""
    # Issue #306
    # Add points and flip names
    df1, df2 = dfs

    left_renamed = (
        df1.copy().rename(columns={"geometry": "polygons"}).set_geometry("polygons")
    )
    if other_geometry:
        left_renamed["geometry"] = df1.centroid.geometry
    assert left_renamed.geometry.name == "polygons"

    baseline = overlay(df1, df2, how=how)
    renamed_result = overlay(left_renamed, df2, how=how)

    # the input frame must not have been modified by overlay
    assert left_renamed.geometry.name == "polygons"

    if how == "difference":
        # in case of 'difference', column names of left frame are preserved
        assert renamed_result.geometry.name == "polygons"
        if other_geometry:
            assert "geometry" in renamed_result.columns
            assert_geoseries_equal(
                renamed_result["geometry"],
                left_renamed["geometry"],
                check_series_type=False,
            )
            renamed_result = renamed_result.drop(["geometry"], axis=1)
        renamed_result = renamed_result.rename(columns={"polygons": "geometry"})
        renamed_result = renamed_result.set_geometry("geometry")

    # TODO if existing column is overwritten -> geometry not last column
    if other_geometry and how == "intersection":
        renamed_result = renamed_result.reindex(columns=baseline.columns)
    assert_geodataframe_equal(baseline, renamed_result)

    # same check with the right-hand frame renamed
    right_renamed = (
        df2.copy().rename(columns={"geometry": "geom"}).set_geometry("geom")
    )
    if other_geometry:
        right_renamed["geometry"] = df2.centroid.geometry
    assert right_renamed.geometry.name == "geom"

    baseline = overlay(df1, df2, how=how)
    renamed_result = overlay(df1, right_renamed, how=how)
    assert_geodataframe_equal(baseline, renamed_result)
def test_bad_how(dfs):
    """An unknown ``how`` value makes ``overlay`` raise ValueError."""
    left, right = dfs
    with pytest.raises(ValueError):
        overlay(left, right, how="spandex")
def test_duplicate_column_name(dfs, how):
    """Columns sharing a name in both inputs get ``_1`` / ``_2`` suffixes."""
    if how == "difference":
        pytest.skip("Difference uses columns from one df only.")
    left, right = dfs
    right_clashing = right.rename(columns={"col2": "col1"})
    result = overlay(left, right_clashing, how=how)
    assert all(column in result.columns for column in ("col1_1", "col1_2"))
def test_geoseries_warning(dfs):
    """Passing a GeoSeries as the second input raises NotImplementedError."""
    left, right = dfs
    # Issue #305
    right_geometry = right.geometry
    with pytest.raises(NotImplementedError):
        overlay(left, right_geometry, how="union")
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not available")
def test_preserve_crs(dfs, how):
    """The overlay result has no CRS, or the CRS shared by both inputs."""
    left, right = dfs
    without_crs = overlay(left, right, how=how)
    assert without_crs.crs is None

    crs = "epsg:4326"
    left.crs = crs
    right.crs = crs
    with_crs = overlay(left, right, how=how)
    assert with_crs.crs == crs
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not available")
def test_crs_mismatch(dfs, how):
    """Overlaying frames with different CRS emits a UserWarning."""
    left, right = dfs
    left.crs, right.crs = 4326, 3857
    with pytest.warns(UserWarning, match="CRS mismatch between the CRS"):
        overlay(left, right, how=how)
def test_empty_intersection(dfs):
    """A default-mode overlay that is expected to come back as an empty frame."""
    left, _ = dfs

    # two squares whose coordinates are all negative
    negative_squares = GeoSeries(
        [
            Polygon([(-1, -1), (-3, -1), (-3, -3), (-1, -3)]),
            Polygon([(-3, -3), (-5, -3), (-5, -5), (-3, -5)]),
        ]
    )
    right = GeoDataFrame({"geometry": negative_squares, "col3": [1, 2]})

    result = overlay(left, right)
    expected = GeoDataFrame([], columns=["col1", "col3", "geometry"])
    assert_geodataframe_equal(result, expected, check_dtype=False)
def test_correct_index(dfs):
    """Regression test for GH883, where the result index was not properly reset."""
    _, right = dfs

    squares = GeoSeries(
        [
            Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
            Polygon([(-1, 1), (1, 1), (1, 3), (-1, 3)]),
            Polygon([(3, 3), (5, 3), (5, 5), (3, 5)]),
        ]
    )
    left = GeoDataFrame({"geometry": squares, "col3": [1, 2, 3]})

    # expected rows: (col3, col2, intersection geometry)
    first_overlap = Polygon([(1, 1), (1, 3), (3, 3), (3, 1), (1, 1)])
    second_overlap = Polygon([(3, 3), (3, 5), (5, 5), (5, 3), (3, 3)])
    expected_rows = [[1, 1, first_overlap], [3, 2, second_overlap]]
    expected = GeoDataFrame(expected_rows, columns=["col3", "col2", "geometry"])

    result = overlay(left, right, keep_geom_type=True)
    assert_geodataframe_equal(result, expected)
def test_warn_on_keep_geom_type(dfs):
    """``keep_geom_type=None`` emits the ``keep_geom_type=True`` ``UserWarning``."""
    _, frame = dfs

    squares = GeoSeries(
        [
            Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
            Polygon([(-1, 1), (1, 1), (1, 3), (-1, 3)]),
            Polygon([(3, 3), (5, 3), (5, 5), (3, 5)]),
        ]
    )
    other = GeoDataFrame({"geometry": squares})

    with pytest.warns(UserWarning, match="`keep_geom_type=True` in overlay"):
        overlay(frame, other, keep_geom_type=None)
@pytest.mark.parametrize(
    "geom_types", ["polys", "poly_line", "poly_point", "line_poly", "point_poly"]
)
def test_overlay_strict(how, keep_geom_type, geom_types):
    """
    Test of mixed geometry types on input and output. Expected results initially
    generated using following snippet.

        polys1 = gpd.GeoSeries([Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
                                Polygon([(3, 3), (5, 3), (5, 5), (3, 5)])])
        df1 = gpd.GeoDataFrame({'col1': [1, 2], 'geometry': polys1})

        polys2 = gpd.GeoSeries([Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
                                Polygon([(-1, 1), (1, 1), (1, 3), (-1, 3)]),
                                Polygon([(3, 3), (5, 3), (5, 5), (3, 5)])])
        df2 = gpd.GeoDataFrame({'geometry': polys2, 'col2': [1, 2, 3]})

        lines1 = gpd.GeoSeries([LineString([(2, 0), (2, 4), (6, 4)]),
                                LineString([(0, 3), (6, 3)])])
        df3 = gpd.GeoDataFrame({'col3': [1, 2], 'geometry': lines1})

        points1 = gpd.GeoSeries([Point((2, 2)),
                                 Point((3, 3))])
        df4 = gpd.GeoDataFrame({'col4': [1, 2], 'geometry': points1})

        params=["union", "intersection", "difference", "symmetric_difference",
                "identity"]
        stricts = [True, False]

        for p in params:
            for s in stricts:
                exp = gpd.overlay(df1, df2, how=p, keep_geom_type=s)
                if not exp.empty:
                    exp.to_file('polys_{p}_{s}.geojson'.format(p=p, s=s),
                                driver='GeoJSON')

        for p in params:
            for s in stricts:
                exp = gpd.overlay(df1, df3, how=p, keep_geom_type=s)
                if not exp.empty:
                    exp.to_file('poly_line_{p}_{s}.geojson'.format(p=p, s=s),
                                driver='GeoJSON')
        for p in params:
            for s in stricts:
                exp = gpd.overlay(df1, df4, how=p, keep_geom_type=s)
                if not exp.empty:
                    exp.to_file('poly_point_{p}_{s}.geojson'.format(p=p, s=s),
                                driver='GeoJSON')
    """
    polys1 = GeoSeries(
        [
            Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
            Polygon([(3, 3), (5, 3), (5, 5), (3, 5)]),
        ]
    )
    df1 = GeoDataFrame({"col1": [1, 2], "geometry": polys1})

    polys2 = GeoSeries(
        [
            Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
            Polygon([(-1, 1), (1, 1), (1, 3), (-1, 3)]),
            Polygon([(3, 3), (5, 3), (5, 5), (3, 5)]),
        ]
    )
    df2 = GeoDataFrame({"geometry": polys2, "col2": [1, 2, 3]})

    lines1 = GeoSeries(
        [LineString([(2, 0), (2, 4), (6, 4)]), LineString([(0, 3), (6, 3)])]
    )
    df3 = GeoDataFrame({"col3": [1, 2], "geometry": lines1})

    points1 = GeoSeries([Point((2, 2)), Point((3, 3))])
    df4 = GeoDataFrame({"col4": [1, 2], "geometry": points1})

    # (left, right) input frames for every parametrized geometry combination
    frames_by_case = {
        "polys": (df1, df2),
        "poly_line": (df1, df3),
        "poly_point": (df1, df4),
        "line_poly": (df3, df1),
        "point_poly": (df4, df1),
    }
    left, right = frames_by_case[geom_types]
    result = overlay(left, right, how=how, keep_geom_type=keep_geom_type)

    expected_path = os.path.join(
        DATA, "strict", f"{geom_types}_{how}_{keep_geom_type}.geojson"
    )
    # Only the file read is guarded: a failure to read means there is no
    # reference file, which the generating snippet above only skips for empty
    # results. Errors raised by the comparison itself must not be swallowed.
    try:
        expected = read_file(expected_path)
    except (DriverError, OSError, RuntimeError):
        # DriverError: fiona >= 1.8; OSError: fiona < 1.8;
        # RuntimeError: pyogrio.DataSourceError
        assert result.empty
        return

    # the order depends on the spatial index used
    # so we sort the resultant dataframes to get a consistent order
    # independently of the spatial index implementation
    assert all(expected.columns == result.columns), "Column name mismatch"
    # keep the frame's own column order so the sort keys are deterministic
    cols = [col for col in result.columns if col != "geometry"]
    expected = expected.sort_values(cols, axis=0).reset_index(drop=True)
    result = result.sort_values(cols, axis=0).reset_index(drop=True)

    # some columns are all-NaN in the result, but get read as object dtype
    # column of None values in read_file
    for col in ["col1", "col3", "col4"]:
        if col in expected.columns and expected[col].isna().all():
            expected[col] = expected[col].astype("float64")

    assert_geodataframe_equal(
        result,
        expected,
        normalize=True,
        check_column_type=False,
        check_less_precise=True,
        check_crs=False,
        check_dtype=False,
    )
def test_mixed_geom_error():
    """``keep_geom_type=True`` with a polygon + line frame is not implemented."""
    squares = GeoSeries(
        [
            Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
            Polygon([(3, 3), (5, 3), (5, 5), (3, 5)]),
        ]
    )
    polygon_frame = GeoDataFrame({"col1": [1, 2], "geometry": squares})

    # one polygon and one linestring in the same geometry column
    polygon_and_line = GeoSeries(
        [
            Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
            LineString([(3, 3), (5, 3), (5, 5), (3, 5)]),
        ]
    )
    mixed_frame = GeoDataFrame({"col1": [1, 2], "geometry": polygon_and_line})

    with pytest.raises(NotImplementedError):
        overlay(polygon_frame, mixed_frame, keep_geom_type=True)
def test_keep_geom_type_error():
    """A GeometryCollection left input raises ``TypeError`` with ``keep_geom_type=True``."""
    collection = GeometryCollection(
        [
            Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
            LineString([(3, 3), (5, 3), (5, 5), (3, 5)]),
        ]
    )
    collection_frame = GeoDataFrame({"col1": [2], "geometry": GeoSeries(collection)})

    squares = GeoSeries(
        [
            Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
            Polygon([(3, 3), (5, 3), (5, 5), (3, 5)]),
        ]
    )
    polygon_frame = GeoDataFrame({"col1": [1, 2], "geometry": squares})

    with pytest.raises(TypeError):
        overlay(collection_frame, polygon_frame, keep_geom_type=True)
def test_keep_geom_type_geometry_collection():
    """Each ``keep_geom_type`` setting yields one row of the expected type (GH 1581)."""
    fixture_dir = os.path.join(DATA, "geom_type")
    df1 = read_file(os.path.join(fixture_dir, "df1.geojson"))
    df2 = read_file(os.path.join(fixture_dir, "df2.geojson"))

    # unset keyword: warns and keeps only polygons
    with pytest.warns(UserWarning, match="`keep_geom_type=True` in overlay"):
        unset = overlay(df1, df2, keep_geom_type=None)
    assert len(unset) == 1
    assert (unset.geom_type == "Polygon").all()

    # explicit True: polygons only
    strict = overlay(df1, df2, keep_geom_type=True)
    assert len(strict) == 1
    assert (strict.geom_type == "Polygon").all()

    # explicit False: the collection is kept as is
    loose = overlay(df1, df2, keep_geom_type=False)
    assert len(loose) == 1
    assert (loose.geom_type == "GeometryCollection").all()
def test_keep_geom_type_geometry_collection2():
    """Compare the ``keep_geom_type=True`` and ``False`` results of one overlay."""
    left_geoms = [
        box(0, 0, 1, 1),
        box(1, 1, 3, 3).union(box(1, 3, 5, 5)),
    ]
    right_geoms = [
        box(0, 0, 1, 1),
        box(3, 1, 4, 2).union(box(4, 1, 5, 4)),
    ]
    left = GeoDataFrame({"left": [0, 1], "geometry": left_geoms})
    right = GeoDataFrame({"right": [0, 1], "geometry": right_geoms})

    # strict mode: only the polygon parts are expected
    strict_result = overlay(left, right, keep_geom_type=True)
    strict_expected = GeoDataFrame(
        {
            "left": [0, 1],
            "right": [0, 1],
            "geometry": [box(0, 0, 1, 1), box(4, 3, 5, 4)],
        }
    )
    assert_geodataframe_equal(strict_result, strict_expected)

    # loose mode: a point and a polygon + line collection are expected too
    loose_result = overlay(left, right, keep_geom_type=False)
    polygon_and_line = GeometryCollection(
        [box(4, 3, 5, 4), LineString([(3, 1), (3, 2)])]
    )
    loose_expected = GeoDataFrame(
        {
            "left": [0, 1, 1],
            "right": [0, 0, 1],
            "geometry": [box(0, 0, 1, 1), Point(1, 1), polygon_and_line],
        }
    )
    assert_geodataframe_equal(loose_result, loose_expected)
def test_keep_geom_type_geomcoll_different_types():
    """With ``keep_geom_type=True`` the line + point collection row is not expected."""
    left_geoms = [box(0, 1, 1, 3), box(10, 10, 12, 12)]
    right_geoms = [
        Polygon([(1, 0), (3, 0), (3, 3), (1, 3), (1, 2), (2, 2), (2, 1), (1, 1)]),
        box(11, 11, 13, 13),
    ]
    left = GeoDataFrame({"left": [0, 1], "geometry": left_geoms})
    right = GeoDataFrame({"right": [0, 1], "geometry": right_geoms})

    strict_result = overlay(left, right, keep_geom_type=True)
    strict_expected = GeoDataFrame(
        {
            "left": [1],
            "right": [1],
            "geometry": [box(11, 11, 12, 12)],
        }
    )
    assert_geodataframe_equal(strict_result, strict_expected)

    loose_result = overlay(left, right, keep_geom_type=False)
    line_and_point = GeometryCollection([LineString([(1, 2), (1, 3)]), Point(1, 1)])
    loose_expected = GeoDataFrame(
        {
            "left": [0, 1],
            "right": [0, 1],
            "geometry": [line_and_point, box(11, 11, 12, 12)],
        }
    )
    assert_geodataframe_equal(loose_result, loose_expected)
def test_keep_geom_type_geometry_collection_difference():
    """``how="difference"`` with ``keep_geom_type=True`` on a sliver input (GH 2163)."""
    subtracted_geoms = [
        box(0, 0, 1, 1),
        box(1, 1, 2, 2),
    ]
    # the tiny sliver in the second geometry may be converted to a
    # linestring during the overlay process due to floating point errors
    # on some platforms
    sliver_geoms = [
        box(0, 0, 1, 1),
        box(1, 1, 2, 3).union(box(2, 2, 3, 2.00000000000000001)),
    ]
    subtracted = GeoDataFrame({"left": [0, 1], "geometry": subtracted_geoms})
    with_sliver = GeoDataFrame({"right": [0, 1], "geometry": sliver_geoms})

    result = overlay(with_sliver, subtracted, keep_geom_type=True, how="difference")
    expected = GeoDataFrame(
        {
            "right": [1],
            "geometry": [box(1, 2, 2, 3)],
        },
    )
    assert_geodataframe_equal(result, expected)
@pytest.mark.parametrize("should_make_valid", [True, False])
def test_overlap_make_valid(should_make_valid):
    """Invalid input is repaired with ``make_valid=True`` and rejected otherwise."""
    # self-intersecting "bowtie" polygon and its repaired counterpart
    bowtie = Polygon([(1, 1), (9, 9), (9, 1), (1, 9), (1, 1)])
    assert not bowtie.is_valid
    fixed_bowtie = make_valid(bowtie)
    assert fixed_bowtie.is_valid

    region = GeoDataFrame(
        {"col1": ["region"], "geometry": GeoSeries([box(0, 0, 10, 10)])}
    )
    bowties = GeoDataFrame(
        {"col1": ["invalid", "valid"], "geometry": GeoSeries([bowtie, fixed_bowtie])}
    )

    if not should_make_valid:
        with pytest.raises(ValueError, match="1 invalid input geometries"):
            overlay(region, bowties, make_valid=should_make_valid)
        return

    result = overlay(region, bowties, make_valid=should_make_valid)
    for row in (0, 1):
        assert result.at[row, "geometry"].equals(fixed_bowtie)
def test_empty_overlay_return_non_duplicated_columns(nybb_filename):
    """An empty overlay result still has suffixed, non-duplicated columns."""
    nybb = geopandas.read_file(nybb_filename)
    # same data with every geometry translated by 20000000
    shifted = nybb.copy()
    shifted.geometry = shifted.translate(20000000)

    result = geopandas.overlay(nybb, shifted)

    attributes = ["BoroCode", "BoroName", "Shape_Leng", "Shape_Area"]
    expected_columns = [
        f"{attribute}_{suffix}" for suffix in (1, 2) for attribute in attributes
    ]
    expected_columns.append("geometry")
    expected = GeoDataFrame(columns=expected_columns, crs=nybb.crs)

    assert_geodataframe_equal(result, expected, check_dtype=False)
def test_non_overlapping(how):
    """Overlay of two single-polygon frames, checked for every ``how`` mode."""
    p1 = Polygon([(0, 0), (2, 0), (2, 2), (0, 2)])
    p2 = Polygon([(3, 3), (5, 3), (5, 5), (3, 5)])
    left = GeoDataFrame({"col1": [1], "geometry": [p1]})
    right = GeoDataFrame({"col2": [2], "geometry": [p2]})

    result = overlay(left, right, how=how)

    if how == "intersection":
        # the index of the empty expected frame depends on the pandas version
        index = None if PANDAS_GE_20 else pd.Index([], dtype="object")
        expected = GeoDataFrame(
            {
                "col1": np.array([], dtype="int64"),
                "col2": np.array([], dtype="int64"),
                "geometry": [],
            },
            index=index,
        )
    elif how in ("union", "symmetric_difference"):
        # both modes expect each polygon back with the other column missing
        expected = GeoDataFrame(
            {
                "col1": [1, np.nan],
                "col2": [np.nan, 2],
                "geometry": [p1, p2],
            }
        )
    elif how == "identity":
        expected = GeoDataFrame(
            {
                "col1": [1.0],
                "col2": [np.nan],
                "geometry": [p1],
            }
        )
    elif how == "difference":
        expected = GeoDataFrame(
            {
                "col1": [1],
                "geometry": [p1],
            }
        )

    assert_geodataframe_equal(result, expected)
def test_no_intersection():
    """Intersection of frames with overlapping bounds but disjoint geometries."""
    # overlapping bounds but non-overlapping geometries
    discs = GeoSeries([Point(x, x).buffer(0.1) for x in range(3)])
    left = GeoDataFrame({"foo": ["a", "b", "c"]}, geometry=discs)
    right = GeoDataFrame({"bar": ["1", "3", "5"]}, geometry=discs.translate(1))

    result = overlay(left, right, how="intersection")
    expected = GeoDataFrame(columns=["foo", "bar", "geometry"])
    assert_geodataframe_equal(result, expected, check_index_type=False)
class TestOverlayWikiExample:
    """Overlay of two overlapping rectangles, checked for each ``how`` mode."""

    def setup_method(self):
        """Build both input layers and the expected frame for each operation."""
        # the three pieces the two rectangles decompose into
        shared = box(4, 2, 6, 4)
        a_only = Polygon([(4, 2), (0, 2), (0, 6), (6, 6), (6, 4), (4, 4), (4, 2)])
        b_only = Polygon([(10, 0), (4, 0), (4, 2), (6, 2), (6, 4), (10, 4), (10, 0)])

        self.layer_a = GeoDataFrame(geometry=[box(0, 2, 6, 6)])
        self.layer_b = GeoDataFrame(geometry=[box(4, 0, 10, 4)])

        self.intersection = GeoDataFrame(geometry=[shared])
        self.union = GeoDataFrame(geometry=[shared, a_only, b_only])
        self.a_difference_b = GeoDataFrame(geometry=[a_only])
        self.b_difference_a = GeoDataFrame(geometry=[b_only])
        self.symmetric_difference = GeoDataFrame(geometry=[a_only, b_only])
        self.a_identity_b = GeoDataFrame(geometry=[shared, a_only])
        self.b_identity_a = GeoDataFrame(geometry=[shared, b_only])

    def test_intersection(self):
        """Intersection is geometrically equal to the shared box."""
        result = overlay(self.layer_a, self.layer_b, how="intersection")
        assert result.geom_equals(self.intersection).all()

    def test_union(self):
        """Union matches the expected three-piece frame."""
        result = overlay(self.layer_a, self.layer_b, how="union")
        assert_geodataframe_equal(result, self.union)

    def test_a_difference_b(self):
        """Difference with layer A on the left."""
        result = overlay(self.layer_a, self.layer_b, how="difference")
        assert_geodataframe_equal(result, self.a_difference_b)

    def test_b_difference_a(self):
        """Difference with layer B on the left."""
        result = overlay(self.layer_b, self.layer_a, how="difference")
        assert_geodataframe_equal(result, self.b_difference_a)

    def test_symmetric_difference(self):
        """Symmetric difference matches the two non-shared pieces."""
        result = overlay(self.layer_a, self.layer_b, how="symmetric_difference")
        assert_geodataframe_equal(result, self.symmetric_difference)

    def test_a_identity_b(self):
        """Identity with layer A on the left."""
        result = overlay(self.layer_a, self.layer_b, how="identity")
        assert_geodataframe_equal(result, self.a_identity_b)

    def test_b_identity_a(self):
        """Identity with layer B on the left."""
        result = overlay(self.layer_b, self.layer_a, how="identity")
        assert_geodataframe_equal(result, self.b_identity_a)
@@ -0,0 +1,890 @@
import os
import warnings
from packaging.version import Version
import numpy as np
import pandas as pd
import shapely
from shapely.geometry import GeometryCollection, LinearRing, LineString, Point
import geopandas
import geopandas._compat as compat
from geopandas import GeoDataFrame, GeoSeries
from geopandas.array import from_shapely
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from numpy.testing import assert_array_equal
from pandas.testing import assert_frame_equal, assert_series_equal
@pytest.fixture
def s():
    """GeoSeries holding the points (0, 0), (1, 1) and (2, 2)."""
    diagonal_points = [Point(i, i) for i in range(3)]
    return GeoSeries(diagonal_points)
@pytest.fixture
def df():
    """Three-row GeoDataFrame: diagonal points plus two int64 value columns."""
    columns = {
        "geometry": [Point(i, i) for i in range(3)],
        "value1": np.arange(3, dtype="int64"),
        "value2": np.array([1, 2, 1], dtype="int64"),
    }
    return GeoDataFrame(columns)
def test_repr(s, df):
    """Text and HTML representations all mention the ``POINT`` geometries."""
    renderings = (repr(s), repr(df), df._repr_html_())
    for rendering in renderings:
        assert "POINT" in rendering
@pytest.mark.skipif(shapely.geos_version < (3, 9, 0), reason="requires GEOS>=3.9")
def test_repr_boxed_display_precision():
    """Check the coordinate precision used in the GeoSeries repr.

    The first half checks the repr without touching the option; the second
    half sets ``geopandas.options.display_precision`` explicitly. The option
    is global, so its previous value is restored afterwards to keep this test
    from influencing the reprs checked by other tests.
    """
    # geographic coordinates
    p1 = Point(10.123456789, 50.123456789)
    p2 = Point(4.123456789, 20.123456789)
    s1 = GeoSeries([p1, p2, None])
    assert "POINT (10.12346 50.12346)" in repr(s1)

    # geographic coordinates 4326
    s3 = GeoSeries([p1, p2], crs=4326)
    assert "POINT (10.12346 50.12346)" in repr(s3)

    # projected coordinates
    p1 = Point(3000.123456789, 3000.123456789)
    p2 = Point(4000.123456789, 4000.123456789)
    s2 = GeoSeries([p1, p2, None])
    assert "POINT (3000.123 3000.123)" in repr(s2)

    # projected geographic coordinate
    s4 = GeoSeries([p1, p2], crs=3857)
    assert "POINT (3000.123 3000.123)" in repr(s4)

    # explicit precision overrides; always put the global option back
    previous_precision = geopandas.options.display_precision
    try:
        geopandas.options.display_precision = 1
        assert "POINT (10.1 50.1)" in repr(s1)

        geopandas.options.display_precision = 9
        assert "POINT (10.123456789 50.123456789)" in repr(s1)
    finally:
        geopandas.options.display_precision = previous_precision
def test_repr_all_missing():
    """Reprs work when every geometry is missing."""
    # https://github.com/geopandas/geopandas/issues/1195
    all_missing = GeoSeries([None, None, None])
    assert "None" in repr(all_missing)

    frame = GeoDataFrame({"a": [1, 2, 3], "geometry": all_missing})
    assert "None" in repr(frame)
    assert "geometry" in frame._repr_html_()
def test_repr_empty():
    """Reprs work for an empty GeoSeries and an empty GeoDataFrame."""
    # https://github.com/geopandas/geopandas/issues/1195
    empty_series = GeoSeries([])
    assert repr(empty_series) == "GeoSeries([], dtype: geometry)"

    empty_frame = GeoDataFrame({"a": [], "geometry": empty_series})
    assert "Empty GeoDataFrame" in repr(empty_frame)
    # https://github.com/geopandas/geopandas/issues/1184
    assert "geometry" in empty_frame._repr_html_()
def test_repr_linearring():
    """LinearRing shows up as such in reprs; LineStrings are never promoted."""
    # https://github.com/geopandas/geopandas/pull/2689
    # specifically, checking internal shapely/wkt/wkb conversions
    # preserve LinearRing
    rings = GeoSeries([LinearRing([(0, 0), (1, 1), (1, -1)])])
    assert "LINEARRING" in str(rings.iloc[0])  # shapely scalar repr
    assert "LINEARRING" in str(rings)  # GeoSeries repr

    # check something coercible to linearring is not converted
    open_line = LineString([(0, 0), (1, 1), (1, -1)])
    closed_line = LineString([(0, 0), (1, 1), (1, -1), (0, 0)])
    lines = GeoSeries([open_line, closed_line])
    assert "LINEARRING" not in str(lines)
def test_indexing(s, df):
    """Scalar, list-like, boolean-mask and slice indexing on both geo classes."""
    # accessing scalar from the geometry (column)
    point = Point(1, 1)
    assert s[1] == point
    assert s.loc[1] == point
    assert s.iloc[1] == point
    assert df.loc[1, "geometry"] == point
    assert df.iloc[1, 0] == point

    # multiple values
    picked = GeoSeries([Point(2, 2), Point(0, 0)], index=[2, 0])
    assert_geoseries_equal(s.loc[[2, 0]], picked)
    assert_geoseries_equal(s.iloc[[2, 0]], picked)
    assert_geoseries_equal(s.reindex([2, 0]), picked)
    assert_geoseries_equal(df.loc[[2, 0], "geometry"], picked)
    # TODO here iloc does not return a GeoSeries
    assert_series_equal(
        df.iloc[[2, 0], 0], picked, check_series_type=False, check_names=False
    )

    # boolean indexing
    keep = np.array([True, False, True])
    kept = GeoSeries([Point(0, 0), Point(2, 2)], index=[0, 2])
    assert_geoseries_equal(s[keep], kept)
    assert_geoseries_equal(s.loc[keep], kept)
    assert_geoseries_equal(df[keep]["geometry"], kept)
    assert_geoseries_equal(df.loc[keep, "geometry"], kept)

    # slices: relabel the index so positions and labels differ
    s.index = [1, 2, 3]
    tail = GeoSeries([Point(1, 1), Point(2, 2)], index=[2, 3])
    assert_series_equal(s[1:], tail)
    assert_series_equal(s.iloc[1:], tail)
    assert_series_equal(s.loc[2:], tail)
def test_reindex(s, df):
    """``reindex`` on GeoSeries and on GeoDataFrame rows and columns.

    Reindexing to a label that does not exist gives a missing geometry.
    Selecting columns keeps the GeoDataFrame class while the geometry column
    is included and returns a plain ``pandas.DataFrame`` when it is not.
    """
    # GeoSeries reindex
    res = s.reindex([1, 2, 3])
    exp = GeoSeries([Point(1, 1), Point(2, 2), None], index=[1, 2, 3])
    assert_geoseries_equal(res, exp)

    # GeoDataFrame reindex index
    res = df.reindex(index=[1, 2, 3])
    assert_geoseries_equal(res.geometry, exp)

    # GeoDataFrame reindex columns
    res = df.reindex(columns=["value1", "geometry"])
    assert isinstance(res, GeoDataFrame)
    assert isinstance(res.geometry, GeoSeries)
    assert_frame_equal(res, df[["value1", "geometry"]])

    res = df.reindex(columns=["value1", "value2"])
    # exact type check on purpose: a GeoDataFrame subclass must not pass
    assert type(res) is pd.DataFrame
    assert_frame_equal(res, df[["value1", "value2"]])
def test_take(s, df):
    """``take`` matches positional selection on GeoSeries and GeoDataFrame."""
    positions = np.array([0, 2])

    # GeoSeries take
    taken = s.take(positions)
    assert isinstance(taken, GeoSeries)
    assert_geoseries_equal(taken, s.iloc[[0, 2]])

    # GeoDataFrame take axis 0
    taken = df.take(positions, axis=0)
    assert isinstance(taken, GeoDataFrame)
    assert_geodataframe_equal(taken, df.iloc[[0, 2], :])

    # GeoDataFrame take axis 1
    # ensure consistent order
    ordered = df.reindex(columns=["value1", "value2", "geometry"])
    taken = ordered.take(positions, axis=1)
    assert isinstance(taken, GeoDataFrame)
    assert_geodataframe_equal(taken, ordered[["value1", "geometry"]])

    # only the two value columns are selected here
    taken = ordered.take(np.array([0, 1]), axis=1)
    assert isinstance(taken, pd.DataFrame)
    assert_frame_equal(taken, ordered[["value1", "value2"]])
def test_take_empty(s, df):
    """Empty selections keep the GeoDataFrame class, shape and index type."""
    # ensure that index type is preserved in an empty take
    # https://github.com/geopandas/geopandas/issues/1190

    def check_empty(frame):
        assert isinstance(frame, GeoDataFrame)
        assert frame.shape == (0, 3)
        assert isinstance(frame.index, pd.DatetimeIndex)

    no_positions = np.array([], dtype="int64")

    # use non-default index
    df.index = pd.date_range("2012-01-01", periods=len(df))

    check_empty(df.take(no_positions, axis=0))

    # the original bug report was an empty boolean mask
    for selection in [df.loc[df["value1"] > 100], df[df["value1"] > 100]]:
        check_empty(selection)
def test_assignment(s, df):
    """Setting one geometry through ``[]``, ``.loc`` and ``.iloc`` works."""
    new_point = Point(10, 10)
    expected = GeoSeries([Point(10, 10), Point(1, 1), Point(2, 2)])

    # GeoSeries: plain item assignment
    series = s.copy()
    series[0] = new_point
    assert_geoseries_equal(series, expected)

    # GeoSeries: label based
    series = s.copy()
    series.loc[0] = new_point
    assert_geoseries_equal(series, expected)

    # GeoSeries: position based
    series = s.copy()
    series.iloc[0] = new_point
    assert_geoseries_equal(series, expected)

    # GeoDataFrame: label based
    frame = df.copy()
    frame.loc[0, "geometry"] = new_point
    assert_geoseries_equal(frame["geometry"], expected)

    # GeoDataFrame: position based
    frame = df.copy()
    frame.iloc[0, 0] = new_point
    assert_geoseries_equal(frame["geometry"], expected)
def test_assign(df):
    """``assign`` returns a GeoDataFrame equal to adding the column by hand."""
    assigned = df.assign(new=1)

    by_hand = df.copy()
    by_hand["new"] = 1

    assert isinstance(assigned, GeoDataFrame)
    assert_frame_equal(assigned, by_hand)
def test_astype(s, df):
    """``astype`` on GeoSeries and GeoDataFrame returns the expected classes."""
    # check geoseries functionality
    with pytest.raises(TypeError):
        s.astype(int)

    as_text = s.astype(str)
    assert as_text[0] == "POINT (0 0)"

    as_object = s.astype(object)
    installed_pandas = Version(pd.__version__)
    if installed_pandas not in (Version("2.1.0"), Version("2.1.1")):
        # https://github.com/geopandas/geopandas/issues/2948 - bug in pandas 2.1.0
        assert isinstance(as_object, pd.Series)
        assert not isinstance(as_object, GeoSeries)
        assert as_object.dtype == object

    renamed = df.rename_geometry("geom_list")

    # check whether returned object is a geodataframe
    converted = renamed.astype({"value1": float})
    assert isinstance(converted, GeoDataFrame)

    # check whether returned object is a dataframe
    converted = renamed.astype(str)
    assert isinstance(converted, pd.DataFrame)
    assert not isinstance(converted, GeoDataFrame)

    converted = renamed.astype({"geom_list": str})
    assert isinstance(converted, pd.DataFrame)
    assert not isinstance(converted, GeoDataFrame)

    converted = renamed.astype(object)
    assert isinstance(converted, pd.DataFrame)
    assert not isinstance(converted, GeoDataFrame)
    assert converted["geom_list"].dtype == object
def test_astype_invalid_geodataframe():
    """``astype`` works on a GeoDataFrame that has no geometry column."""
    # https://github.com/geopandas/geopandas/issues/1144
    # a GeoDataFrame without geometry column should not error in astype
    no_geometry = GeoDataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    converted = no_geometry.astype(object)

    assert isinstance(converted, pd.DataFrame)
    assert not isinstance(converted, GeoDataFrame)
    assert converted["a"].dtype == object
def test_convert_dtypes(df):
    """``convert_dtypes`` keeps the class, CRS and geometry column name."""
    # https://github.com/geopandas/geopandas/issues/1870
    value_then_geom = ["value1", "value2", "geometry"]

    # Test geometry col is first col, first, geom_col_name=geometry
    # (order is important in concat, used internally)
    converted = df.convert_dtypes()
    reference = GeoDataFrame(
        pd.DataFrame(df).convert_dtypes(), crs=df.crs, geometry=df.geometry.name
    )
    # Checking type and metadata are right
    assert_geodataframe_equal(reference, converted)

    # Test geom last, geom_col_name=geometry
    converted_geom_last = df[value_then_geom].convert_dtypes()
    assert_geodataframe_equal(reference[value_then_geom], converted_geom_last)

    if not compat.HAS_PYPROJ:
        return

    # Test again with crs set and custom geom col name
    value_then_points = ["value1", "value2", "points"]
    with_crs = df.set_crs(epsg=4326).rename_geometry("points")
    reference_crs = GeoDataFrame(
        pd.DataFrame(with_crs).convert_dtypes(),
        crs=with_crs.crs,
        geometry=with_crs.geometry.name,
    )
    assert_geodataframe_equal(reference_crs, with_crs.convert_dtypes())

    # Test geom last, custom geom col name
    converted_points_last = with_crs[value_then_points].convert_dtypes()
    assert_geodataframe_equal(reference_crs[value_then_points], converted_points_last)
def test_to_csv(df):
    """``to_csv`` writes the geometries as WKT, one platform-terminated row each."""
    rows = [
        "geometry,value1,value2",
        "POINT (0 0),0,1",
        "POINT (1 1),1,2",
        "POINT (2 2),2,1",
    ]
    # every row, including the last one, ends with the platform line separator
    expected = "".join(row + os.linesep for row in rows)
    assert df.to_csv(index=False) == expected
@pytest.mark.filterwarnings(
    "ignore:Dropping of nuisance columns in DataFrame reductions"
)
def test_numerical_operations(s, df):
    """Numeric reductions skip geometry; arithmetic on geometries raises."""
    # df methods ignore the geometry column
    # (pandas >= 2.0 needs numeric_only passed explicitly)
    totals = df.sum(numeric_only=True) if compat.PANDAS_GE_20 else df.sum()
    expected_totals = pd.Series([3, 4], index=["value1", "value2"])
    assert_series_equal(totals, expected_totals)

    # series methods raise error (not supported for geometry)
    with pytest.raises(TypeError):
        s.sum()

    with pytest.raises(TypeError):
        s.max()

    with pytest.raises((TypeError, ValueError)):
        # TODO: remove ValueError after pandas-dev/pandas#32749
        s.idxmax()

    # numerical ops raise an error
    with pytest.raises(TypeError):
        df + 1

    with pytest.raises(TypeError):
        s + 1

    # boolean comparisons work
    comparison = df == 100
    all_false = pd.DataFrame(False, index=df.index, columns=df.columns)
    assert_frame_equal(comparison, all_false)
def test_where(s):
    """``where`` replaces the masked-out geometry with a missing value."""
    keep = np.array([True, False, True])
    masked = s.where(keep)
    expected = GeoSeries([Point(0, 0), None, Point(2, 2)])
    assert_series_equal(masked, expected)
def test_select_dtypes(df):
    """Selecting numeric dtypes returns only the two value columns."""
    numeric_only = df.select_dtypes(include=[np.number])
    assert_frame_equal(numeric_only, df[["value1", "value2"]])
def test_equals(s, df):
    """``equals`` returns real booleans and notices changed values."""
    # https://github.com/geopandas/geopandas/issues/1420
    series_copy = s.copy()
    assert s.equals(series_copy) is True

    series_copy.iloc[0] = None
    assert s.equals(series_copy) is False

    frame_copy = df.copy()
    assert df.equals(frame_copy) is True

    # a changed geometry breaks equality
    frame_copy.loc[0, "geometry"] = Point(10, 10)
    assert df.equals(frame_copy) is False

    # so does a changed attribute value
    frame_copy = df.copy()
    frame_copy.loc[0, "value1"] = 10
    assert df.equals(frame_copy) is False
# Missing values
def test_fillna_scalar(s, df):
    """``fillna`` with scalar fill values on GeoSeries and GeoDataFrame."""
    with_gap = GeoSeries([Point(0, 0), None, Point(2, 2)])
    assert_geoseries_equal(with_gap.fillna(Point(1, 1)), s)

    # allow np.nan although this does not change anything
    # https://github.com/geopandas/geopandas/issues/1149
    assert_geoseries_equal(with_gap.fillna(np.nan), with_gap)

    # a geometry fill value fills the missing geometry of a frame
    frame_with_gap = df.copy()
    frame_with_gap["geometry"] = with_gap
    assert_geodataframe_equal(frame_with_gap.fillna(Point(1, 1)), df)

    # raise exception if trying to fill missing geometry w/ non-geometry
    with pytest.raises((NotImplementedError, TypeError)):  # GH2351
        frame_with_gap.fillna(0)

    # allow non-geometry fill value if there are no missing geometries
    # https://github.com/geopandas/geopandas/issues/1149
    numeric_gap = df.copy()
    numeric_gap.loc[0, "value1"] = np.nan
    filled = numeric_gap.fillna(0)
    assert_geodataframe_equal(filled.astype({"value1": "int64"}), df)
def test_fillna_series(s):
    """``fillna`` with a GeoSeries aligns the fill values on the index."""
    # fill na with another GeoSeries
    with_gap = GeoSeries([Point(0, 0), None, Point(2, 2)])

    # check na filled with the same index
    same_index = GeoSeries([Point(1, 1)] * 3)
    assert_geoseries_equal(with_gap.fillna(same_index), s)

    # check na filled based on index, not position
    labels = [3, 2, 1]
    reversed_labels = GeoSeries([Point(i, i) for i in labels], index=labels)
    assert_geoseries_equal(with_gap.fillna(reversed_labels), s)

    # check na filled but the input length is different
    single_match = GeoSeries([Point(1, 1)], index=[1])
    assert_geoseries_equal(with_gap.fillna(single_match), s)

    # check na filled but the inputting index is different
    no_match = GeoSeries([Point(1, 1)], index=[9])
    assert_geoseries_equal(with_gap.fillna(no_match), with_gap)
def test_fillna_inplace(s):
    """``fillna(..., inplace=True)`` fills the calling series itself."""
    with_gap = GeoSeries([Point(0, 0), None, Point(2, 2)])
    backing_array = with_gap.array  # captured before the fill

    with_gap.fillna(Point(1, 1), inplace=True)
    assert_geoseries_equal(with_gap, s)

    if compat.PANDAS_GE_21:
        # starting from pandas 2.1, there is support to do this actually inplace
        assert with_gap.array is backing_array
def test_dropna():
    """``dropna`` removes the missing geometry and keeps the other labels."""
    with_gap = GeoSeries([Point(0, 0), None, Point(2, 2)])
    dropped = with_gap.dropna()
    assert_geoseries_equal(dropped, with_gap.loc[[0, 2]])
@pytest.mark.parametrize("NA", [None, np.nan])
def test_isna(NA):
    """``isnull``/``isna`` and ``notnull``/``notna`` flag the missing entry.

    Checked for both ``None`` and ``np.nan`` as the missing value; the index
    and name of the GeoSeries are expected on the boolean result.
    """
    s2 = GeoSeries([Point(0, 0), NA, Point(2, 2)], index=[2, 4, 5], name="tt")
    exp = pd.Series([False, True, False], index=[2, 4, 5], name="tt")

    res = s2.isnull()
    # exact type check on purpose: the result must be a plain pandas Series
    assert type(res) is pd.Series
    assert_series_equal(res, exp)

    res = s2.isna()
    assert_series_equal(res, exp)

    res = s2.notnull()
    assert_series_equal(res, ~exp)

    res = s2.notna()
    assert_series_equal(res, ~exp)
# Any / all
def test_any_all():
    """``any`` / ``all`` treat an empty geometry as falsy."""
    empty = GeometryCollection([])

    # mixed: one empty, one non-empty
    mixed = GeoSeries([empty, Point(1, 1)])
    assert not mixed.all()
    assert mixed.any()

    # only non-empty geometries
    filled = GeoSeries([Point(1, 1), Point(1, 1)])
    assert filled.all()
    assert filled.any()

    # only empty geometries
    hollow = GeoSeries([empty, empty])
    assert not hollow.all()
    assert not hollow.any()
# Groupby / algos
def test_sort_values():
    """``sort_values`` orders points ascending and descending."""
    points = GeoSeries([Point(0, 0), Point(2, 2), Point(0, 2)])

    ascending_order = points.sort_values().index.tolist()
    assert ascending_order == [0, 2, 1]

    descending_order = points.sort_values(ascending=False).index.tolist()
    assert descending_order == [1, 2, 0]

    # empty geoseries
    no_points = points.iloc[:0]
    assert_geoseries_equal(no_points.sort_values(), points.iloc[:0])
def test_sort_values_empty_missing():
    """Sort order when the series contains missing and empty geometries."""
    mixed = GeoSeries([Point(0, 0), None, Point(), Point(1, 1)])
    # (sort_values keyword arguments, expected index order)
    mixed_cases = [
        # default: NA sorts last, empty first
        ({}, [2, 0, 3, 1]),
        # descending: NA sorts last, empty last
        ({"ascending": False}, [3, 0, 2, 1]),
        # NAs first, empty first after NAs
        ({"na_position": "first"}, [1, 2, 0, 3]),
        # NAs first, descending with empty last
        ({"ascending": False, "na_position": "first"}, [1, 3, 0, 2]),
    ]
    for sort_kwargs, expected_order in mixed_cases:
        ordered = mixed.sort_values(**sort_kwargs)
        assert ordered.index.tolist() == expected_order

    # all missing / empty
    uniform_cases = [
        (GeoSeries([None, None, None]), [0, 1, 2]),
        (GeoSeries([Point(), Point(), Point()]), [0, 1, 2]),
        (GeoSeries([Point(), None, Point()]), [0, 2, 1]),
    ]
    for series, expected_order in uniform_cases:
        ordered = series.sort_values()
        assert ordered.index.tolist() == expected_order
def test_unique():
    """``unique`` drops the repeated point and keeps first-seen order."""
    with_repeat = GeoSeries([Point(0, 0), Point(0, 0), Point(2, 2)])
    expected = from_shapely([Point(0, 0), Point(2, 2)])
    # TODO should have specialized GeometryArray assert method
    assert_array_equal(with_repeat.unique(), expected)
def pd14_compat_index(index):
    """Return ``index`` as it should be passed for the installed pandas.

    On pandas >= 1.4 the values are converted with ``from_shapely``;
    on older versions they are returned unchanged.
    """
    return from_shapely(index) if compat.PANDAS_GE_14 else index
def test_value_counts():
    """``value_counts`` for plain, CRS-aware, mixed and missing geometries."""
    # the result series is named "count" from pandas 2.0 on
    name = "count" if compat.PANDAS_GE_20 else None

    # each object is considered unique
    points = GeoSeries([Point(0, 0), Point(1, 1), Point(0, 0)])
    point_counts = pd.Series(
        [2, 1], index=pd14_compat_index([Point(0, 0), Point(1, 1)]), name=name
    )
    assert_series_equal(points.value_counts(), point_counts)

    # Check crs doesn't make a difference - note it is not kept in output index anyway
    points_crs = GeoSeries([Point(0, 0), Point(1, 1), Point(0, 0)], crs="EPSG:4326")
    assert_series_equal(points_crs.value_counts(), point_counts)
    if compat.PANDAS_GE_14:
        # TODO should/ can we fix CRS being lost
        assert points_crs.value_counts().index.array.crs is None

    # check mixed geometry
    mixed = GeoSeries([Point(0, 0), LineString([[1, 1], [2, 2]]), Point(0, 0)])
    mixed_index = pd14_compat_index([Point(0, 0), LineString([[1, 1], [2, 2]])])
    mixed_counts = pd.Series([2, 1], index=mixed_index, name=name)
    assert_series_equal(mixed.value_counts(), mixed_counts)

    # check None is handled
    with_gap = GeoSeries([Point(0, 0), None, Point(0, 0)])

    counts_dropna = pd.Series([2], index=pd14_compat_index([Point(0, 0)]), name=name)
    assert_series_equal(with_gap.value_counts(dropna=True), counts_dropna)

    counts_keepna = pd.Series(
        [2, 1], index=pd14_compat_index([Point(0, 0), None]), name=name
    )
    assert_series_equal(with_gap.value_counts(dropna=False), counts_keepna)
@pytest.mark.xfail(strict=False)
def test_drop_duplicates_series():
    """Two equal points should collapse into one (flaky, hence xfail)."""
    # duplicated does not yet use EA machinery
    # (https://github.com/pandas-dev/pandas/issues/27264)
    # but relies on unstable hashing of unhashable objects in numpy array
    # giving flaky test (https://github.com/pandas-dev/pandas/issues/27035)
    twins = GeoSeries([Point(0, 0), Point(0, 0)])
    deduplicated = twins.drop_duplicates()
    assert len(deduplicated) == 1
@pytest.mark.xfail(strict=False)
def test_drop_duplicates_frame():
    """``drop_duplicates`` on a frame, by geometry only and by all columns."""
    # duplicated does not yet use EA machinery, see above
    n_rows = 3
    # equal geometries (separate objects) with distinct attribute values
    same_points = [Point(0, 0) for _ in range(n_rows)]
    frame = GeoDataFrame({"geometry": same_points, "value1": range(n_rows)})

    by_geometry = frame.drop_duplicates(subset="geometry")
    assert len(by_geometry) == 1

    by_all_columns = frame.drop_duplicates()
    assert len(by_all_columns) == n_rows
def test_groupby(df):
    """Counts, numeric reductions and geometry ``apply`` on a grouped frame."""
    group_index = pd.Index([1, 2], name="value2")

    # counts work fine
    counts = df.groupby("value2").count()
    expected_counts = pd.DataFrame(
        {"geometry": [2, 1], "value1": [2, 1], "value2": [1, 2]}
    ).set_index("value2")
    assert_frame_equal(counts, expected_counts)

    # reductions ignore geometry column
    # (pandas >= 2.0 needs numeric_only passed explicitly)
    grouped = df.groupby("value2")
    sums = grouped.sum(numeric_only=True) if compat.PANDAS_GE_20 else grouped.sum()
    expected_sums = pd.DataFrame(
        {"value1": [2, 1], "value2": [1, 2]}, dtype="int64"
    ).set_index("value2")
    assert_frame_equal(sums, expected_sums)

    # applying on the geometry column
    merged = df.groupby("value2")["geometry"].apply(lambda x: x.union_all())
    expected_merged = GeoSeries(
        [shapely.geometry.MultiPoint([(0, 0), (2, 2)]), Point(1, 1)],
        index=group_index,
        name="geometry",
    )
    assert_series_equal(merged, expected_merged)

    # apply on geometry column not resulting in new geometry
    areas = df.groupby("value2")["geometry"].apply(lambda x: x.union_all().area)
    expected_areas = pd.Series([0.0, 0.0], index=group_index, name="geometry")
    assert_series_equal(areas, expected_areas)
def test_groupby_groups(df):
    """``get_group`` returns a GeoDataFrame holding the rows of that group."""
    group = df.groupby("value2").get_group(1)
    assert isinstance(group, GeoDataFrame)
    assert_frame_equal(group, df.loc[[0, 2]])
@pytest.mark.parametrize("crs", [None, "EPSG:4326"])
@pytest.mark.parametrize("geometry_name", ["geometry", "geom"])
def test_groupby_metadata(crs, geometry_name):
    """Groups handed to ``groupby(...).apply`` keep their class and CRS.

    See https://github.com/geopandas/geopandas/issues/2294
    """
    if crs and not compat.HAS_PYPROJ:
        pytest.skip("requires pyproj")

    columns = {
        geometry_name: [Point(0, 0), Point(1, 1), Point(0, 0)],
        "value1": np.arange(3, dtype="int64"),
        "value2": np.array([1, 2, 1], dtype="int64"),
    }
    df = GeoDataFrame(columns, crs=crs, geometry=geometry_name)

    # pandas is deprecating that the group key is present as column in the
    # dataframe passed to `func`. To suppress this warning, it introduced
    # a new include_groups keyword
    apply_kwargs = {"include_groups": False} if compat.PANDAS_GE_22 else {}

    # dummy test asserting we can access the crs
    def check_group(group):
        assert isinstance(group, GeoDataFrame)
        assert group.crs == crs

    df.groupby("value2").apply(check_group, **apply_kwargs)

    # selecting the non-group columns -> no need to pass the keyword
    # https://github.com/geopandas/geopandas/pull/2966#issuecomment-1878816712
    # with pandas 2.0 and 2.1 with geom col != geometry this is failing
    known_failure = (
        compat.PANDAS_GE_20
        and not compat.PANDAS_GE_22
        and geometry_name != "geometry"
    )
    if known_failure:
        with pytest.raises(AttributeError):
            df.groupby("value2")[[geometry_name, "value1"]].apply(check_group)
    else:
        df.groupby("value2")[[geometry_name, "value1"]].apply(check_group)

    # actual test with functionality
    def self_join(group):
        return geopandas.sjoin(group, group[[geometry_name, "value1"]], how="inner")

    res = df.groupby("value2").apply(self_join, **apply_kwargs)

    expected = df.take([0, 0, 2, 2, 1])
    expected = expected.set_index("value2", drop=compat.PANDAS_GE_22, append=True)
    expected = expected.swaplevel()
    expected = expected.rename(columns={"value1": "value1_left"})
    expected = expected.assign(value1_right=[0, 2, 0, 2, 1])
    assert_geodataframe_equal(res.drop(columns=["index_right"]), expected)
def test_apply(s):
    """``GeoSeries.apply`` stays a GeoSeries only when geometries are returned."""

    # function that returns geometry preserves GeoSeries class
    def passthrough(geom):
        assert isinstance(geom, Point)
        return geom

    geoms = s.apply(passthrough)
    assert isinstance(geoms, GeoSeries)
    assert_geoseries_equal(geoms, s)

    # function that returns non-geometry results in Series
    def x_coordinate(geom):
        assert isinstance(geom, Point)
        return geom.x

    xs = s.apply(x_coordinate)
    assert not isinstance(xs, GeoSeries)
    assert_series_equal(xs, pd.Series([0.0, 1.0, 2.0]))
def test_apply_loc_len1(df):
    """``apply`` works on a length-1 geometry subset selected with ``loc``.

    Regression test for a pandas bug with inconsistent Block ndim:
    https://github.com/geopandas/geopandas/issues/1078
    """
    single_row = df.loc[[0], "geometry"]
    applied = single_row.apply(lambda geom: geom.is_empty)
    np.testing.assert_allclose(applied, single_row.is_empty)
@pytest.mark.skipif(compat.PANDAS_GE_30, reason="convert_dtype is removed in pandas 3")
def test_apply_convert_dtypes_keyword(s):
    """``GeoSeries.apply`` accepts ``convert_dtype``.

    On pandas >= 2.1 exactly one warning mentioning the parameter is expected;
    on older pandas no warning is expected.
    """
    expect_warning = compat.PANDAS_GE_21
    if expect_warning:
        recorder = pytest.warns()
    else:
        recorder = warnings.catch_warnings(record=True)

    with recorder as record:
        res = s.apply(lambda x: x, convert_dtype=True, args=())
    assert_geoseries_equal(res, s)

    if not expect_warning:
        assert len(record) == 0
        return
    assert len(record) == 1
    assert "the convert_dtype parameter" in str(record[0].message)
@pytest.mark.parametrize("crs", [None, "EPSG:4326"])
def test_apply_no_geometry_result(df, crs):
    """Casting every column to ``str`` via ``apply`` yields a plain DataFrame."""
    if crs and not compat.HAS_PYPROJ:
        pytest.skip("requires pyproj")
    if crs:
        df = df.set_crs(crs)

    def stringify(values):
        return values.astype(str)

    expected = df.astype(str)
    # column-wise (axis=0) first, then row-wise (axis=1)
    for axis in (0, 1):
        result = df.apply(stringify, axis=axis)
        assert type(result) is pd.DataFrame
        assert_frame_equal(result, expected)
def test_apply_preserves_geom_col_name(df):
    """A column-wise identity ``apply`` keeps a renamed geometry column active."""
    renamed = df.rename_geometry("geom")
    applied = renamed.apply(lambda col: col, axis=0)
    assert applied.geometry.name == "geom"
def test_df_apply_returning_series(df):
    """Row-wise ``apply`` picks the right result type for each kind of value."""
    # https://github.com/geopandas/geopandas/issues/2283
    geoms = df.apply(lambda row: row.geometry, axis=1)
    assert_geoseries_equal(geoms, df.geometry, check_crs=False)

    values = df.apply(lambda row: row.value1, axis=1)
    assert_series_equal(values, df["value1"].rename(None))

    # https://github.com/geopandas/geopandas/issues/2480
    nans = df.apply(lambda x: float("NaN"), axis=1)
    assert nans.dtype == "float64"

    # assert list of nones is not promoted to GeometryDtype
    nones = df.apply(lambda x: None, axis=1)
    assert nones.dtype == "object"

    # https://github.com/geopandas/geopandas/issues/2889
    # contrived case such that `from_shapely` receives an array of geodataframes
    frames = df.apply(lambda row: df.geometry.to_frame(), axis=1)
    assert frames.dtype == "object"
def test_df_apply_geometry_dtypes(df):
    """Column-wise ``apply`` passes geometry columns as GeoSeries.

    See https://github.com/geopandas/geopandas/issues/1852
    """
    seen = []
    df["geom2"] = df.geometry

    # record (column name, container type) for every column passed to the func
    df.apply(lambda srs: seen.append((srs.name, type(srs))))

    assert seen == [
        ("geometry", GeoSeries),
        ("value1", pd.Series),
        ("value2", pd.Series),
        ("geom2", GeoSeries),
    ]
def test_pivot(df):
    """``pivot`` works although it creates MultiIndex columns.

    See https://github.com/geopandas/geopandas/issues/2057
    """
    plain_pivot = pd.DataFrame(df).pivot(columns="value1")
    assert_geodataframe_equal(df.pivot(columns="value1"), GeoDataFrame(plain_pivot))
def test_preserve_attrs(df):
    """``attrs`` survive indexing operations and common methods.

    See https://github.com/geopandas/geopandas/issues/1654
    """
    df.attrs["name"] = "my_name"
    attrs = {"name": "my_name"}
    assert df.attrs == attrs

    # preserve attrs in indexing operations; check the *result* of each
    # indexing operation, not the frame it was taken from
    for subset in [df[:2], df[df["value1"] > 2], df[["value2", "geometry"]]]:
        assert subset.attrs == attrs

    # preserve attrs in methods
    df2 = df.reset_index()
    assert df2.attrs == attrs

    # https://github.com/geopandas/geopandas/issues/1875
    df3 = df2.explode(index_parts=True)
    assert df3.attrs == attrs
def test_preserve_flags(df):
    """``flags`` survive indexing operations and methods, and are enforced.

    See https://github.com/geopandas/geopandas/issues/1654
    """
    df = df.set_flags(allows_duplicate_labels=False)
    assert df.flags.allows_duplicate_labels is False

    # preserve flags in indexing operations; check the *result* of each
    # indexing operation, not the frame it was taken from
    for subset in [df[:2], df[df["value1"] > 2], df[["value2", "geometry"]]]:
        assert subset.flags.allows_duplicate_labels is False

    # preserve flags in methods
    df2 = df.reset_index()
    assert df2.flags.allows_duplicate_labels is False

    # it is honored for operations that introduce duplicate labels
    with pytest.raises(ValueError):
        df.reindex([0, 0, 1])
    with pytest.raises(ValueError):
        df[["value1", "value1", "geometry"]]
    with pytest.raises(ValueError):
        pd.concat([df, df])
def test_ufunc():
    """A shapely ufunc applied to a GeoSeries gives a writeable GeoSeries.

    This calls a shapely ufunc, but currently relies on pandas'
    implementation of ``__array_ufunc__`` to wrap the result back into a
    GeoSeries.
    """
    points = GeoSeries([Point(i, i) for i in (1, 2, 3)])
    buffered = shapely.buffer(points, 2)
    assert isinstance(buffered, GeoSeries)

    # ensure the result is still writeable
    # (https://github.com/geopandas/geopandas/issues/3178)
    assert buffered.array._data.flags.writeable
    replacement = Point(10, 10)
    buffered.loc[0] = replacement
    assert buffered.iloc[0] == Point(10, 10)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,51 @@
from geopandas.tools._show_versions import (
_get_C_info,
_get_deps_info,
_get_sys_info,
show_versions,
)
def test_get_sys_info():
    """The system info mapping exposes the basic interpreter/platform keys."""
    info = _get_sys_info()
    for key in ("python", "executable", "machine"):
        assert key in info
def test_get_c_info():
    """The C library info mapping lists GEOS, GDAL and PROJ entries."""
    info = _get_C_info()
    expected_keys = (
        "GEOS",
        "GEOS lib",
        "GDAL",
        "GDAL data dir",
        "PROJ",
        "PROJ data dir",
    )
    for key in expected_keys:
        assert key in info
def test_get_deps_info():
    """The dependency info mapping has an entry for every reported package."""
    info = _get_deps_info()
    expected_packages = (
        "geopandas",
        "pandas",
        "fiona",
        "numpy",
        "shapely",
        "pyproj",
        "matplotlib",
        "mapclassify",
        "geopy",
        "psycopg",
        "psycopg2",
        "geoalchemy2",
    )
    for package in expected_packages:
        assert package in info
def test_show_versions(capsys):
    """``show_versions`` prints system, C-library and dependency sections."""
    show_versions()
    captured = capsys.readouterr()
    stdout = captured[0]
    for marker in ("python", "GEOS", "geopandas"):
        assert marker in stdout
@@ -0,0 +1,959 @@
from math import sqrt
import numpy as np
import shapely
from shapely.geometry import (
GeometryCollection,
LineString,
MultiPolygon,
Point,
Polygon,
box,
)
import geopandas
from geopandas import GeoDataFrame, GeoSeries, read_file
from geopandas import _compat as compat
import pytest
from numpy.testing import assert_array_equal
class TestSeriesSindex:
    """Spatial index behaviour of ``GeoSeries`` (and ``has_sindex`` on frames)."""

    def test_has_sindex(self):
        """``has_sindex`` tracks whether a spatial index is currently cached."""
        lower = Polygon([(0, 0), (1, 0), (1, 1)])
        upper = Polygon([(0, 0), (1, 1), (0, 1)])

        frame = GeoDataFrame({"geom": [lower, upper]}, geometry="geom")
        assert not frame.has_sindex
        _ = frame.sindex
        assert frame.has_sindex
        # clearing the cached index on the geometry array resets the flag
        frame.geometry.values._sindex = None
        assert not frame.has_sindex
        _ = frame.sindex
        assert frame.has_sindex

        series = GeoSeries([lower, upper])
        assert not series.has_sindex
        _ = series.sindex
        assert series.has_sindex
        series.values._sindex = None
        assert not series.has_sindex
        _ = series.sindex
        assert series.has_sindex

    def test_empty_geoseries(self):
        """The index of an empty GeoSeries is falsy and has length zero."""
        empty = GeoSeries(dtype=object)
        assert not empty.sindex
        assert len(empty.sindex) == 0

    def test_point(self):
        """A single point is found only by a bounding box that covers it."""
        series = GeoSeries([Point(0, 0)])
        assert series.sindex.size == 1

        covering = series.sindex.intersection((-1, -1, 1, 1))
        assert len(list(covering)) == 1

        disjoint = series.sindex.intersection((-2, -2, -1, -1))
        assert len(list(disjoint)) == 0

    def test_empty_point(self):
        """A single empty Point results in an empty tree."""
        series = GeoSeries([Point()])
        assert not series.sindex
        assert len(series.sindex) == 0

    def test_polygons(self):
        """Every polygon in the series ends up in the index."""
        shapes = [
            Polygon([(0, 0), (1, 0), (1, 1)]),
            Polygon([(0, 0), (1, 1), (0, 1)]),
            Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
        ]
        assert GeoSeries(shapes).sindex.size == 3

    @pytest.mark.filterwarnings("ignore:The series.append method is deprecated")
    @pytest.mark.skipif(compat.PANDAS_GE_20, reason="append removed in pandas 2.0")
    def test_polygons_append(self):
        """The index of an appended series covers both inputs."""
        shapes = [
            Polygon([(0, 0), (1, 0), (1, 1)]),
            Polygon([(0, 0), (1, 1), (0, 1)]),
            Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),
        ]
        first = GeoSeries(shapes)
        second = GeoSeries(shapes, [3, 4, 5])
        combined = first.append(second)
        assert len(combined) == 6
        assert combined.sindex.size == 6

    def test_lazy_build(self):
        """The index is only created on first access."""
        series = GeoSeries([Point(0, 0)])
        assert series.values._sindex is None
        assert series.sindex.size == 1
        assert series.values._sindex is not None

    def test_rebuild_on_item_change(self):
        """Assigning an item yields a new index object."""
        series = GeoSeries([Point(0, 0)])
        index_before = series.sindex
        series.iloc[0] = Point(0, 0)
        assert series.sindex is not index_before

    def test_rebuild_on_slice(self):
        """Only a full, order-preserving slice shares the original index."""
        series = GeoSeries([Point(0, 0), Point(0, 0)])
        index_before = series.sindex

        # Select a couple of rows
        assert series.iloc[:1].sindex is not index_before
        # Select all rows
        assert series.iloc[:].sindex is index_before
        # Select all rows and flip
        assert series.iloc[::-1].sindex is not index_before
class TestFrameSindex:
    """Spatial index behaviour of ``GeoDataFrame``."""

    def setup_method(self):
        """Create a five-row frame with points on the diagonal as geometry."""
        columns = {
            "A": range(5),
            "B": range(-5, 0),
            "geom": [Point(i, i) for i in range(5)],
        }
        self.df = GeoDataFrame(columns, geometry="geom")

    def test_sindex(self):
        """The frame index contains every row and answers bbox queries."""
        self.df.crs = "epsg:4326"
        assert self.df.sindex.size == 5
        found = list(self.df.sindex.intersection((2.5, 2.5, 4, 4)))
        assert len(found) == 2
        assert found[0] == 3

    def test_lazy_build(self):
        """The index is only created on first access."""
        assert self.df.geometry.values._sindex is None
        assert self.df.sindex.size == 5
        assert self.df.geometry.values._sindex is not None

    def test_sindex_rebuild_on_set_geometry(self):
        """Replacing the geometry in place yields a new index object."""
        # First build the sindex
        assert self.df.sindex is not None
        index_before = self.df.sindex
        shifted = [Point(i, i) for i in range(5, 10)]
        self.df.set_geometry(shifted, inplace=True)
        assert self.df.sindex is not index_before

    def test_rebuild_on_row_slice(self):
        """Only a full, order-preserving row slice shares the original index."""
        # Select a subset of rows rebuilds
        index_before = self.df.sindex
        assert self.df.iloc[:1].sindex is not index_before

        # Slicing all does not rebuild
        index_before = self.df.sindex
        assert self.df.iloc[:].sindex is index_before

        # Re-ordering rebuilds
        assert self.df.iloc[::-1].sindex is not index_before

    def test_rebuild_on_single_col_selection(self):
        """Selecting the geometry column should not rebuild the spatial index."""
        index_before = self.df.sindex
        # by column label
        assert self.df["geom"].sindex is index_before
        # through the ``geometry`` accessor
        assert self.df.geometry.sindex is index_before

    def test_rebuild_on_multiple_col_selection(self):
        """Index identity after selecting a subset of columns, per pandas version.

        Selecting a subset of columns preserves the index for pandas < 2.0;
        with pandas 2.0 the column is copied, losing the index; with
        pandas >= 3.0 and Copy-on-Write it is preserved again.
        """
        index_before = self.df.sindex
        index_is_lost = compat.PANDAS_GE_20 and not compat.PANDAS_GE_30

        for columns in (["geom", "A"], ["A", "geom"]):
            subset = self.df[columns]
            if index_is_lost:
                assert subset.sindex is not index_before
            else:
                assert subset.sindex is index_before

    def test_rebuild_on_update_inplace(self):
        """An in-place sort invalidates the cached index."""
        frame = self.df.copy()
        index_before = frame.sindex
        # sorting in place
        frame.sort_values("A", ascending=False, inplace=True)
        # spatial index should be invalidated
        assert not frame.has_sindex
        # and should be different
        assert frame.sindex is not index_before
        # sorting should still have happened though
        assert frame.index.tolist() == [4, 3, 2, 1, 0]

    def test_update_inplace_no_rebuild(self):
        """An in-place column rename keeps the cached index."""
        frame = self.df.copy()
        index_before = frame.sindex
        frame.rename(columns={"A": "AA"}, inplace=True)
        # a rename shouldn't invalidate the index
        assert frame.has_sindex
        # and the "new" should be the same
        assert index_before is frame.sindex
# Skip to accommodate Shapely geometries being unhashable # TODO unskip?
@pytest.mark.skip
@pytest.mark.usefixtures("_setup_class_nybb_filename")
class TestJoinSindex:
    """Spatial index of frames derived from the NYC boroughs dataset.

    Currently skipped as a whole (see the comment above the class).
    """

    def setup_method(self):
        """Read the boroughs file provided by the class-level fixture."""
        self.boros = read_file(self.nybb_filename)

    def test_merge_geo(self):
        """Filtered views and their merge each answer index queries correctly."""
        # location used for every query below
        location = (1012821.80, 229228.26)

        # First check that we gets hits from the boros frame.
        hits = self.boros.sindex.intersection(location)
        res = [self.boros.iloc[hit]["BoroName"] for hit in hits]
        assert res == ["Bronx", "Queens"]

        # Check that we only get the Bronx from this view.
        first = self.boros[self.boros["BoroCode"] < 3]
        hits = first.sindex.intersection(location)
        res = [first.iloc[hit]["BoroName"] for hit in hits]
        assert res == ["Bronx"]

        # Check that we only get Queens from this view.
        second = self.boros[self.boros["BoroCode"] >= 3]
        hits = second.sindex.intersection(location)
        # plain list: a stray trailing comma used to wrap it in a 1-tuple,
        # which can never compare equal to ["Queens"]
        res = [second.iloc[hit]["BoroName"] for hit in hits]
        assert res == ["Queens"]

        # Get both the Bronx and Queens again.
        merged = first.merge(second, how="outer")
        assert len(merged) == 5
        assert merged.sindex.size == 5
        hits = merged.sindex.intersection(location)
        res = [merged.iloc[hit]["BoroName"] for hit in hits]
        assert res == ["Bronx", "Queens"]
class TestShapelyInterface:
def setup_method(self):
    """Build a frame of five diagonal points followed by one box geometry."""
    geoms = [Point(i, i) for i in range(5)]
    geoms.append(box(10, 10, 20, 20))  # include a box geometry
    self.df = GeoDataFrame({"geom": geoms}, geometry="geom")
    self.expected_size = len(geoms)
# --------------------------- `intersection` tests -------------------------- #
@pytest.mark.parametrize(
    "test_geom, expected",
    [
        ((-1, -1, -0.5, -0.5), []),
        ((-0.5, -0.5, 0.5, 0.5), [0]),
        ((0, 0, 1, 1), [0, 1]),
        ((0, 0), [0]),
    ],
)
def test_intersection_bounds_tuple(self, test_geom, expected):
    """``intersection`` accepts 4-tuples (bounds) and 2-tuples of coordinates."""
    hits = list(self.df.sindex.intersection(test_geom))
    assert_array_equal(hits, expected)
@pytest.mark.parametrize("test_geom", [(-1, -1, -0.5), -0.5, None, Point(0, 0)])
def test_intersection_invalid_bounds_tuple(self, test_geom):
    """``intersection`` raises a useful ``TypeError`` for invalid inputs."""
    sindex = self.df.sindex
    with pytest.raises(TypeError):
        sindex.intersection(test_geom)
# ------------------------------ `query` tests ------------------------------ #
@pytest.mark.parametrize(
    "predicate, test_geom, expected",
    [
        # bbox does not intersect
        (None, box(-1, -1, -0.5, -0.5), []),
        # bbox intersects
        (None, box(-0.5, -0.5, 0.5, 0.5), [0]),
        # bbox intersects multiple
        (None, box(0, 0, 1, 1), [0, 1]),
        # bbox intersects but not geometry
        (None, LineString([(0, 1), (1, 0)]), [0, 1]),
        # bbox does not intersect
        ("intersects", box(-1, -1, -0.5, -0.5), []),
        # bbox and geometry intersect
        ("intersects", box(-0.5, -0.5, 0.5, 0.5), [0]),
        # bbox and geometry intersect multiple
        ("intersects", box(0, 0, 1, 1), [0, 1]),
        # bbox intersects but not geometry
        ("intersects", LineString([(0, 1), (1, 0)]), []),
        # does not intersect
        ("within", box(0.25, 0.28, 0.75, 0.75), []),
        # intersects but is not within
        ("within", box(0, 0, 10, 10), []),
        # intersects and is within
        ("within", box(11, 11, 12, 12), [5]),
        # intersects but not within
        ("within", LineString([(0, 1), (1, 0)]), []),
        # intersects but does not contain
        ("contains", box(0, 0, 1, 1), []),
        # intersects and contains
        ("contains", box(0, 0, 1.001, 1.001), [1]),
        # intersects and contains
        ("contains", box(0.5, 0.5, 1.5, 1.5), [1]),
        # intersects and contains multiple
        ("contains", box(-1, -1, 2, 2), [0, 1]),
        # intersects but not contains
        ("contains", LineString([(0, 1), (1, 0)]), []),
        # bbox intersects and touches
        ("touches", box(-1, -1, 0, 0), [0]),
        # bbox intersects but geom does not touch
        ("touches", box(-0.5, -0.5, 1.5, 1.5), []),
        # contains but does not contains_properly
        ("contains", box(10, 10, 20, 20), [5]),
        # covers (0, 0) and (1, 1)
        ("covers", box(-0.5, -0.5, 1, 1), [0, 1]),
        # does not cover any
        ("covers", box(0.001, 0.001, 0.99, 0.99), []),
        # covers but does not contain
        ("covers", box(0, 0, 1, 1), [0, 1]),
        # intersects but does not contain
        ("contains_properly", box(0, 0, 1, 1), []),
        # intersects 2 and contains 1
        ("contains_properly", box(0, 0, 1.001, 1.001), [1]),
        # intersects 1 and contains 1
        ("contains_properly", box(0.5, 0.5, 1.001, 1.001), [1]),
        # intersects and contains
        ("contains_properly", box(0.5, 0.5, 1.5, 1.5), [1]),
        # intersects and contains multiple
        ("contains_properly", box(-1, -1, 2, 2), [0, 1]),
        # contains but does not contains_properly
        ("contains_properly", box(10, 10, 20, 20), []),
    ],
)
def test_query(self, predicate, test_geom, expected):
    """``query`` with one geometry returns the expected positions per predicate."""
    matches = self.df.sindex.query(test_geom, predicate=predicate)
    assert_array_equal(matches, expected)
def test_query_invalid_geometry(self):
    """``query`` raises ``TypeError`` when given a string instead of a geometry."""
    sindex = self.df.sindex
    with pytest.raises(TypeError):
        sindex.query("notavalidgeom")
@pytest.mark.skipif(not compat.GEOS_GE_310, reason="Requires GEOS 3.10")
@pytest.mark.parametrize(
    "distance, test_geom, expected",
    [
        # bounds don't intersect and not within distance=0
        (0, box(9.0, 9.0, 9.9, 9.9), []),
        # bounds don't intersect but is within distance=1
        (1, box(9.0, 9.0, 9.9, 9.9), [5]),
        # within 1-D absolute distance in both axes, but not euclidean distance
        (0.5, Point(0.5, 0.5), []),
        # same as before but within euclidean distance
        (sqrt(2 * 0.5**2) + 1e-9, Point(0.5, 0.5), [0, 1]),
        # less than euclidean distance between points, multi-object
        (
            sqrt(2) - 1e-9,
            [
                Polygon([(0, 0), (1, 0), (1, 1)]),
                Polygon([(1, 1), (2, 1), (2, 2)]),
            ],
            [[0, 0, 1, 1], [0, 1, 1, 2]],
        ),
        # more than euclidean distance between points, multi-object
        (
            sqrt(2) + 1e-9,
            [
                Polygon([(0, 0), (1, 0), (1, 1)]),
                Polygon([(1, 1), (2, 1), (2, 2)]),
            ],
            [[0, 0, 0, 1, 1, 1, 1], [0, 1, 2, 0, 1, 2, 3]],
        ),
        # distance is array-like, broadcastable to geometry
        (
            [2, 10],
            [Point(0.5, 0.5), Point(1, 1)],
            [[0, 0, 1, 1, 1, 1, 1], [0, 1, 0, 1, 2, 3, 4]],
        ),
    ],
)
def test_query_dwithin(self, distance, test_geom, expected):
    """``query`` with the ``dwithin`` predicate honours the ``distance`` keyword."""
    matches = self.df.sindex.query(test_geom, predicate="dwithin", distance=distance)
    assert_array_equal(matches, expected)
@pytest.mark.skipif(not compat.GEOS_GE_310, reason="Requires GEOS 3.10")
def test_dwithin_no_distance(self):
    """``query`` with ``dwithin`` but no ``distance`` raises ``ValueError``."""
    msg = "'distance' parameter is required for 'dwithin' predicate"
    with pytest.raises(ValueError, match=msg):
        self.df.sindex.query(Point(0, 0), predicate="dwithin")
@pytest.mark.parametrize(
    "predicate",
    [
        None,
        "contains",
        "contains_properly",
        "covered_by",
        "covers",
        "crosses",
        "intersects",
        "overlaps",
        "touches",
        "within",
    ],
)
def test_query_distance_invalid(self, predicate):
    """Passing ``distance`` with any predicate other than ``dwithin`` raises."""
    origin = Point(0, 0)
    with pytest.raises(
        ValueError,
        match="'distance' parameter is only supported in combination with 'dwithin'",
    ):
        self.df.sindex.query(origin, predicate=predicate, distance=0)
@pytest.mark.skipif(
    compat.GEOS_GE_310, reason="Test for 'dwithin'-incompatible versions of GEOS"
)
def test_dwithin_requirements(self):
    """Using ``dwithin`` on GEOS older than 3.10 raises a ``ValueError``."""
    msg = "predicate = 'dwithin' requires GEOS >= 3.10.0"
    with pytest.raises(ValueError, match=msg):
        self.df.sindex.query(Point(0, 0), predicate="dwithin", distance=0)
@pytest.mark.parametrize(
    "test_geom, expected_value",
    [
        (None, []),
        (GeometryCollection(), []),
        (Point(), []),
        (MultiPolygon(), []),
        (Polygon(), []),
    ],
)
def test_query_empty_geometry(self, test_geom, expected_value):
    """``query`` with ``None`` or an empty geometry returns no matches."""
    matches = self.df.sindex.query(test_geom)
    assert_array_equal(matches, expected_value)
def test_query_invalid_predicate(self):
    """``query`` rejects an unknown predicate name with ``ValueError``."""
    probe = box(-1, -1, -0.5, -0.5)
    with pytest.raises(ValueError):
        self.df.sindex.query(probe, predicate="test")
@pytest.mark.parametrize(
    "sort, expected",
    (
        (True, [0, 1, 2]),
        # False could be anything, at least we'll know if it changes
        (False, [0, 1, 2]),
    ),
)
def test_query_sorting(self, sort, expected):
    """Check that results from `query` don't depend on the
    order of geometries.

    The same elements must always be returned. The exact order is only
    required for ``sort=True``; with ``sort=False`` an order mismatch is
    reported as an xfail instead of a failure.
    """
    # these geometries come from a reported issue:
    # https://github.com/geopandas/geopandas/issues/1337
    # there is no theoretical reason they were chosen
    test_polys = GeoSeries([Polygon([(1, 1), (3, 1), (3, 3), (1, 3)])])
    tree_polys = GeoSeries(
        [
            Polygon([(1, 1), (3, 1), (3, 3), (1, 3)]),
            Polygon([(-1, 1), (1, 1), (1, 3), (-1, 3)]),
            Polygon([(3, 3), (5, 3), (5, 5), (3, 5)]),
        ]
    )
    # a single geometry is queried, so the result is a flat array of positions
    test_geo = test_polys.values[0]
    res = tree_polys.sindex.query(test_geo, sort=sort)

    # asserting the same elements
    assert sorted(res) == sorted(expected)

    # asserting the exact array can fail if sort=False
    try:
        assert_array_equal(res, expected)
    except AssertionError:
        if sort is False:
            pytest.xfail(
                "rtree results are known to be unordered, see "
                "https://github.com/geopandas/geopandas/issues/1337\n"
                "Expected:\n {}\n".format(expected)
                + "Got:\n {}\n".format(res.tolist())
            )
        # sort=True must give the exact order: re-raise with the
        # original traceback
        raise
# ------------------------- `query_bulk` tests -------------------------- #
@pytest.mark.parametrize(
    "predicate, test_geom, expected",
    [
        (None, [(-1, -1, -0.5, -0.5)], [[], []]),
        (None, [(-0.5, -0.5, 0.5, 0.5)], [[0], [0]]),
        (None, [(0, 0, 1, 1)], [[0, 0], [0, 1]]),
        ("intersects", [(-1, -1, -0.5, -0.5)], [[], []]),
        ("intersects", [(-0.5, -0.5, 0.5, 0.5)], [[0], [0]]),
        ("intersects", [(0, 0, 1, 1)], [[0, 0], [0, 1]]),
        # only second geom intersects
        ("intersects", [(-1, -1, -0.5, -0.5), (-0.5, -0.5, 0.5, 0.5)], [[1], [0]]),
        # both geoms intersect
        (
            "intersects",
            [(-1, -1, 1, 1), (-0.5, -0.5, 0.5, 0.5)],
            [[0, 0, 1], [0, 1, 0]],
        ),
        # does not intersect
        ("within", [(0.25, 0.28, 0.75, 0.75)], [[], []]),
        # intersects but is not within
        ("within", [(0, 0, 10, 10)], [[], []]),
        # intersects and is within
        ("within", [(11, 11, 12, 12)], [[0], [5]]),
        # intersects and covers, but does not contain
        ("contains", [(0, 0, 1, 1)], [[], []]),
        # intersects 2 and contains 1
        ("contains", [(0, 0, 1.001, 1.001)], [[0], [1]]),
        # intersects 1 and contains 1
        ("contains", [(0.5, 0.5, 1.001, 1.001)], [[0], [1]]),
        # intersects and contains
        ("contains", [(0.5, 0.5, 1.5, 1.5)], [[0], [1]]),
        # intersects and contains multiple
        ("contains", [(-1, -1, 2, 2)], [[0, 0], [0, 1]]),
        # contains but does not contains_properly
        ("contains", [(10, 10, 20, 20)], [[0], [5]]),
        # bbox intersects and touches
        ("touches", [(-1, -1, 0, 0)], [[0], [0]]),
        # bbox intersects but geom does not touch
        ("touches", [(-0.5, -0.5, 1.5, 1.5)], [[], []]),
        # covers (0, 0) and (1, 1)
        ("covers", [(-0.5, -0.5, 1, 1)], [[0, 0], [0, 1]]),
        # does not cover any
        ("covers", [(0.001, 0.001, 0.99, 0.99)], [[], []]),
        # covers but does not contain
        ("covers", [(0, 0, 1, 1)], [[0, 0], [0, 1]]),
        # intersects but does not contain
        ("contains_properly", [(0, 0, 1, 1)], [[], []]),
        # intersects 2 and contains 1
        ("contains_properly", [(0, 0, 1.001, 1.001)], [[0], [1]]),
        # intersects 1 and contains 1
        ("contains_properly", [(0.5, 0.5, 1.001, 1.001)], [[0], [1]]),
        # intersects and contains
        ("contains_properly", [(0.5, 0.5, 1.5, 1.5)], [[0], [1]]),
        # intersects and contains multiple
        ("contains_properly", [(-1, -1, 2, 2)], [[0, 0], [0, 1]]),
        # contains but does not contains_properly
        ("contains_properly", [(10, 10, 20, 20)], [[], []]),
    ],
)
def test_query_bulk(self, predicate, test_geom, expected):
    """``query`` with a list of boxes returns the expected index pairs.

    ``test_geom`` is a list of bounds tuples; each one is turned into a box
    before querying.
    """
    query_boxes = [box(*bounds) for bounds in test_geom]
    pairs = self.df.sindex.query(query_boxes, predicate=predicate)
    assert_array_equal(pairs, expected)
@pytest.mark.parametrize(
    "test_geoms, expected_value",
    [
        # single empty geometry
        ([GeometryCollection()], [[], []]),
        # None should be skipped
        ([GeometryCollection(), None], [[], []]),
        ([None], [[], []]),
        ([None, box(-0.5, -0.5, 0.5, 0.5), None], [[1], [0]]),
    ],
)
def test_query_bulk_empty_geometry(self, test_geoms, expected_value):
    """Bulk ``query`` skips empty geometries and ``None`` entries."""
    pairs = self.df.sindex.query(test_geoms)
    assert_array_equal(pairs, expected_value)
def test_query_bulk_empty_input_array(self):
    """Bulk ``query`` with an empty object array returns two empty rows."""
    no_geoms = np.array([], dtype=object)
    pairs = self.df.sindex.query(no_geoms)
    assert_array_equal(pairs, [[], []])
def test_query_bulk_invalid_input_geometry(self):
    """``query`` raises ``TypeError`` for a string passed as ``geometry``."""
    sindex = self.df.sindex
    with pytest.raises(TypeError):
        sindex.query("notanarray")
def test_query_bulk_invalid_predicate(self):
    """Bulk ``query`` rejects an unknown predicate name with ``ValueError``."""
    query_boxes = [box(-1, -1, -0.5, -0.5)]
    with pytest.raises(ValueError):
        self.df.sindex.query(query_boxes, predicate="test")
@pytest.mark.parametrize(
    "predicate, test_geom, expected",
    (
        (None, (-1, -1, -0.5, -0.5), [[], []]),
        ("intersects", (-1, -1, -0.5, -0.5), [[], []]),
        ("contains", (-1, -1, 1, 1), [[0], [0]]),
    ),
)
def test_query_bulk_input_type(self, predicate, test_geom, expected):
    """Tests that query can accept a GeoSeries, GeometryArray or
    numpy array.

    ``test_geom`` is a bounds tuple; it is wrapped in a one-element GeoSeries
    from which every other input container is derived.
    """
    # pass through GeoSeries to test input type
    series = geopandas.GeoSeries([box(*test_geom)], index=["0"])

    # each distinct container is queried exactly once
    inputs = (
        series,  # GeoSeries
        series.geometry,  # GeoSeries via the ``geometry`` accessor
        series.geometry.values,  # GeometryArray
        series.geometry.values.to_numpy(),  # numpy array
    )
    for geometry in inputs:
        res = self.df.sindex.query(geometry, predicate=predicate)
        assert_array_equal(res, expected)
@pytest.mark.parametrize(
    "sort, expected",
    [
        (True, [[0, 0, 0], [0, 1, 2]]),
        # False could be anything, at least we'll know if it changes
        (False, [[0, 0, 0], [0, 1, 2]]),
    ],
)
def test_query_bulk_sorting(self, sort, expected):
    """Bulk ``query`` returns the same pairs regardless of geometry order.

    An exact-order mismatch is reported as xfail when ``sort`` is False and
    as a failure otherwise.
    """
    # these geometries come from a reported issue:
    # https://github.com/geopandas/geopandas/issues/1337
    # there is no theoretical reason they were chosen
    square = [(1, 1), (3, 1), (3, 3), (1, 3)]
    test_polys = GeoSeries([Polygon(square)])
    tree_polys = GeoSeries(
        [
            Polygon(square),
            Polygon([(-1, 1), (1, 1), (1, 3), (-1, 3)]),
            Polygon([(3, 3), (5, 3), (5, 5), (3, 5)]),
        ]
    )
    res = tree_polys.sindex.query(test_polys, sort=sort)

    # asserting the same elements
    for row in (0, 1):
        assert sorted(res[row]) == sorted(expected[row])

    # asserting the exact array can fail if sort=False
    try:
        assert_array_equal(res, expected)
    except AssertionError as e:
        if sort is not False:
            raise e
        pytest.xfail(
            "rtree results are known to be unordered, see "
            "https://github.com/geopandas/geopandas/issues/1337\n"
            "Expected:\n {}\n".format(expected)
            + "Got:\n {}\n".format(res.tolist())
        )
        raise e
# ------------------------- `nearest` tests ------------------------- #
@pytest.mark.parametrize("return_all", [True, False])
@pytest.mark.parametrize(
    "geometry,expected",
    [
        ([0.25, 0.25], [[0], [0]]),
        ([0.75, 0.75], [[0], [1]]),
    ],
)
def test_nearest_single(self, geometry, expected, return_all):
    """``nearest`` for one query point, given as ``Point`` or shapely scalar."""
    diagonal = shapely.points(np.arange(10), np.arange(10))
    tree_frame = geopandas.GeoDataFrame({"geometry": diagonal})

    for query_point in (Point(geometry), shapely.points(geometry)):
        found = tree_frame.sindex.nearest(query_point, return_all=return_all)
        assert_array_equal(found, expected)
@pytest.mark.parametrize("return_all", [True, False])
@pytest.mark.parametrize(
    "geometry,expected",
    [
        ([(1, 1), (0, 0)], [[0, 1], [1, 0]]),
        ([(1, 1), (0.25, 1)], [[0, 1], [1, 1]]),
    ],
)
def test_nearest_multi(self, geometry, expected, return_all):
    """``nearest`` for several query points, passed in four container types."""
    diagonal = shapely.points(np.arange(10), np.arange(10))
    tree_frame = geopandas.GeoDataFrame({"geometry": diagonal})

    as_list = [Point(coords) for coords in geometry]
    as_array = shapely.points(geometry)
    as_series = geopandas.GeoSeries(as_array)
    xs, ys = zip(*geometry)
    as_geometry_array = geopandas.points_from_xy(xs, ys)

    for query_points in (as_list, as_array, as_series, as_geometry_array):
        found = tree_frame.sindex.nearest(query_points, return_all=return_all)
        assert_array_equal(found, expected)
@pytest.mark.parametrize("return_all", [True, False])
@pytest.mark.parametrize(
    "geometry,expected",
    [
        (None, [[], []]),
        ([None], [[], []]),
    ],
)
def test_nearest_none(self, geometry, expected, return_all):
    """``nearest`` with ``None`` (scalar or in a list) returns no pairs."""
    diagonal = shapely.points(np.arange(10), np.arange(10))
    tree_frame = geopandas.GeoDataFrame({"geometry": diagonal})
    found = tree_frame.sindex.nearest(geometry, return_all=return_all)
    assert_array_equal(found, expected)
@pytest.mark.parametrize("return_distance", [True, False])
@pytest.mark.parametrize(
    "return_all,max_distance,expected",
    [
        (True, None, ([[0, 0, 1], [0, 1, 5]], [sqrt(0.5), sqrt(0.5), sqrt(50)])),
        (False, None, ([[0, 1], [0, 5]], [sqrt(0.5), sqrt(50)])),
        (True, 1, ([[0, 0], [0, 1]], [sqrt(0.5), sqrt(0.5)])),
        (False, 1, ([[0], [0]], [sqrt(0.5)])),
    ],
)
def test_nearest_max_distance(
    self, expected, max_distance, return_all, return_distance
):
    """``nearest`` honours ``max_distance`` and optionally returns distances.

    ``expected`` is an ``(index pairs, distances)`` tuple; the distances are
    only compared when ``return_distance`` is True.
    """
    diagonal = shapely.points(np.arange(10), np.arange(10))
    tree_frame = geopandas.GeoDataFrame({"geometry": diagonal})
    query_points = [Point(0.5, 0.5), Point(0, 10)]

    found = tree_frame.sindex.nearest(
        query_points,
        return_all=return_all,
        max_distance=max_distance,
        return_distance=return_distance,
    )

    if not return_distance:
        assert_array_equal(found, expected[0])
        return
    assert_array_equal(found[0], expected[0])
    assert_array_equal(found[1], expected[1])
@pytest.mark.parametrize("return_distance", [True, False])
@pytest.mark.parametrize(
    "return_all,max_distance,exclusive,expected",
    [
        (False, None, False, ([[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], 5 * [0])),
        (False, None, True, ([[0, 1, 2, 3, 4], [1, 0, 1, 2, 3]], 5 * [sqrt(2)])),
        (True, None, False, ([[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], 5 * [0])),
        (
            True,
            None,
            True,
            ([[0, 1, 1, 2, 2, 3, 3, 4], [1, 0, 2, 1, 3, 2, 4, 3]], 8 * [sqrt(2)]),
        ),
        (False, 1.1, True, ([[1, 2, 5], [5, 5, 1]], 3 * [1])),
        (True, 1.1, True, ([[1, 2, 5, 5], [5, 5, 1, 2]], 4 * [1])),
    ],
)
def test_nearest_exclusive(
    self, expected, max_distance, return_all, return_distance, exclusive
):
    """``nearest`` with ``exclusive``, querying the tree with its own geometries.

    ``expected`` is an ``(index pairs, distances)`` tuple; the distances are
    only compared when ``return_distance`` is True.
    """
    grid = shapely.points(np.arange(5), np.arange(5))
    if max_distance:
        # add a non grid point
        grid = np.append(grid, [Point(1, 2)])
    tree_frame = geopandas.GeoDataFrame({"geometry": grid})

    found = tree_frame.sindex.nearest(
        grid,
        return_all=return_all,
        max_distance=max_distance,
        return_distance=return_distance,
        exclusive=exclusive,
    )

    if not return_distance:
        assert_array_equal(found, expected[0])
        return
    assert_array_equal(found[0], expected[0])
    assert_array_equal(found[1], expected[1])
# --------------------------- misc tests ---------------------------- #
def test_empty_tree_geometries(self):
    """Build a sindex over a mix of real, empty and missing geometries."""
    mixed = [Point(0, 0), None, Point(), Point(1, 1), Point()]
    tree = geopandas.GeoDataFrame(geometry=mixed).sindex
    hits = tree.query(Point(1, 1))
    # the match must be reported at its position in the original input
    assert hits[0] == 3
def test_size(self):
    """The ``size`` property matches the expected number of indexed items."""
    tree = self.df.sindex
    assert tree.size == self.expected_size
def test_len(self):
    """``len()`` of a spatial index matches the expected number of items."""
    tree = self.df.sindex
    assert len(tree) == self.expected_size
def test_is_empty(self):
    """The ``is_empty`` property is true only when nothing is indexed."""
    # series without any indexable geometry: no rows, a missing value,
    # and an empty point
    empty_inputs = (
        geopandas.GeoSeries([], dtype=object),
        geopandas.GeoSeries([None]),
        geopandas.GeoSeries([Point()]),
    )
    for series in empty_inputs:
        assert series.sindex.is_empty

    # a single real point is enough to make the tree non-empty
    populated = geopandas.GeoSeries([Point(0, 0)])
    assert not populated.sindex.is_empty
@pytest.mark.parametrize(
    "predicate, expected_shape",
    [
        (None, (2, 471)),
        ("intersects", (2, 213)),
        ("within", (2, 213)),
        ("contains", (2, 0)),
        ("overlaps", (2, 0)),
        ("crosses", (2, 0)),
        ("touches", (2, 0)),
    ],
)
def test_integration_natural_earth(
    self, predicate, expected_shape, naturalearth_lowres, naturalearth_cities
):
    """Check the query result shape on the naturalearth datasets."""
    countries = read_file(naturalearth_lowres)
    cities = read_file(naturalearth_cities)
    matches = countries.sindex.query(cities.geometry, predicate)
    assert matches.shape == expected_shape
@@ -0,0 +1,186 @@
import warnings
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from shapely.geometry import Point, Polygon
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import HAS_PYPROJ
from geopandas.array import from_shapely
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
# Module-level fixtures shared by the tests below.

# s1: two 2x2 squares; the reference series.
s1 = GeoSeries(
[
Polygon([(0, 0), (2, 0), (2, 2), (0, 2)]),
Polygon([(2, 2), (4, 2), (4, 4), (2, 4)]),
]
)
# s2: the same squares as s1, but the first ring starts at a different vertex.
s2 = GeoSeries(
[
Polygon([(0, 2), (0, 0), (2, 0), (2, 2)]),
Polygon([(2, 2), (4, 2), (4, 4), (2, 4)]),
]
)
# s3: the same polygons as s2, held in a plain pandas Series.
s3 = Series(
[
Polygon([(0, 2), (0, 0), (2, 0), (2, 2)]),
Polygon([(2, 2), (4, 2), (4, 4), (2, 4)]),
]
)
# a: the same polygons as s2, converted with from_shapely.
a = from_shapely(
[
Polygon([(0, 2), (0, 0), (2, 0), (2, 2)]),
Polygon([(2, 2), (4, 2), (4, 4), (2, 4)]),
]
)
# NOTE(review): this binding is replaced by `s4 = s1.copy()` a few lines below,
# before any test runs, so the Series wrapping `a` is never actually used.
s4 = Series(a)
df1 = GeoDataFrame({"col1": [1, 2], "geometry": s1})
df2 = GeoDataFrame({"col1": [1, 2], "geometry": s2})
# s4 / s5: copies of s1 / s2 whose underlying arrays get a CRS assigned
# (4326 and 27700 respectively).
s4 = s1.copy()
s4.array.crs = 4326
s5 = s2.copy()
s5.array.crs = 27700
# s6: like s2, but the first polygon has the vertex (0, 3) instead of (0, 2).
s6 = GeoSeries(
[
Polygon([(0, 3), (0, 0), (2, 0), (2, 2)]),
Polygon([(2, 2), (4, 2), (4, 4), (2, 4)]),
]
)
# df4 / df5: frames with several geometry columns; they hold the same columns
# but list "geom2" and "geom3" in opposite order.
df4 = GeoDataFrame(
{"col1": [1, 2], "geometry": s1.copy(), "geom2": s4.copy(), "geom3": s5.copy()},
crs=3857,
)
df5 = GeoDataFrame(
{"col1": [1, 2], "geometry": s1.copy(), "geom3": s5.copy(), "geom2": s4.copy()},
crs=3857,
)
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_geoseries():
    """Check ``assert_geoseries_equal`` on equal, relaxed and unequal input."""
    assert_geoseries_equal(s1, s2)

    # plain pandas Series are accepted once type/dtype checks are relaxed
    relaxed = {"check_series_type": False, "check_dtype": False}
    assert_geoseries_equal(s1, s3, **relaxed)
    assert_geoseries_equal(s3, s2, **relaxed)
    assert_geoseries_equal(s1, s4, check_series_type=False)

    with pytest.raises(AssertionError) as excinfo:
        assert_geoseries_equal(s1, s2, check_less_precise=True)
    message = str(excinfo.value)
    assert "1 out of 2 geometries are not almost equal" in message
    assert "not almost equal: [0]" in message

    with pytest.raises(AssertionError) as excinfo:
        assert_geoseries_equal(s2, s6, check_less_precise=False)
    message = str(excinfo.value)
    assert "1 out of 2 geometries are not equal" in message
    assert "not equal: [0]" in message
def test_geodataframe():
    """Check ``assert_geodataframe_equal`` on matching and differing frames."""
    assert_geodataframe_equal(df1, df2)

    with pytest.raises(AssertionError):
        assert_geodataframe_equal(df1, df2, check_less_precise=True)

    # column order matters unless check_like=True is passed
    reordered = df2[["geometry", "col1"]]
    with pytest.raises(AssertionError):
        assert_geodataframe_equal(df1, reordered)
    assert_geodataframe_equal(df1, reordered, check_like=True)

    # a single differing attribute value must be detected
    altered = df2.copy()
    altered.loc[0, "col1"] = 10
    with pytest.raises(AssertionError):
        assert_geodataframe_equal(df1, altered)

    assert_geodataframe_equal(df5, df4, check_like=True)
    if HAS_PYPROJ:
        # overriding the CRS of one geometry column makes the frames differ
        df5["geom2"] = df5.geom2.set_crs(3857, allow_override=True)
        with pytest.raises(AssertionError):
            assert_geodataframe_equal(df5, df4, check_like=True)
def test_equal_nans():
    """A series with a missing value compares equal to its own copy."""
    with_missing = GeoSeries([Point(0, 0), np.nan])
    assert_geoseries_equal(with_missing, with_missing.copy())
    assert_geoseries_equal(
        with_missing, with_missing.copy(), check_less_precise=True
    )
def test_no_crs():
    """Frames built with ``crs=None`` and ``crs={}`` compare equal."""
    without_crs, with_empty_crs = (
        GeoDataFrame({"col1": [1, 2], "geometry": s1}, crs=crs)
        for crs in (None, {})
    )
    assert_geodataframe_equal(without_crs, with_empty_crs)
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not available")
def test_ignore_crs_mismatch():
    """``check_crs=False`` suppresses both the CRS comparison and warnings."""
    frame_4326 = GeoDataFrame(
        {"col1": [1, 2], "geometry": s1.copy()}, crs="EPSG:4326"
    )
    frame_31370 = GeoDataFrame({"col1": [1, 2], "geometry": s1}, crs="EPSG:31370")

    with pytest.raises(AssertionError):
        assert_geodataframe_equal(frame_4326, frame_31370)

    # with `check_crs=False` the comparison passes, and comparing geometries
    # that carry different CRS must not emit any warning either
    with warnings.catch_warnings(record=True) as caught:
        assert_geodataframe_equal(frame_4326, frame_31370, check_crs=False)
    assert len(caught) == 0
def test_almost_equal_but_not_equal():
    """A tiny offset passes the less-precise check but fails the exact one."""
    at_origin = GeoSeries([Point(0, 0)])
    near_origin = GeoSeries([Point(0.0000001, 0)])

    assert_geoseries_equal(at_origin, near_origin, check_less_precise=True)
    with pytest.raises(AssertionError):
        assert_geoseries_equal(at_origin, near_origin)
def test_geodataframe_no_active_geometry_column():
    """Frames without an active geometry column can still be compared."""

    def _build():
        frame = GeoDataFrame(
            {"value": [1, 2], "geometry": [Point(1, 1), Point(2, 2)]}
        )
        frame["geom2"] = GeoSeries([Point(3, 3), Point(4, 4)])
        return frame

    def _build_without_active_geometry():
        frame = _build()
        frame._geometry_column_name = None
        return frame

    # no active geometry column (None)
    assert_geodataframe_equal(
        _build_without_active_geometry(), _build_without_active_geometry()
    )

    # active geometry column ("geometry") not present
    kept = ["value", "geom2"]
    assert_geodataframe_equal(_build()[kept], _build()[kept])

    # no geometry column left at all
    left = GeoDataFrame(_build()[["value"]])
    right = GeoDataFrame(_build()[["value"]])
    assert_geodataframe_equal(left, right)
def test_geodataframe_multiindex():
    """Frames with MultiIndex columns compare equal to an identical twin."""

    def _build(drop_active_geometry=False):
        points = DataFrame([[Point(0, 0), Point(1, 1)], [Point(2, 2), Point(3, 3)]])
        frame = GeoDataFrame(points.astype("geometry"))
        frame.columns = pd.MultiIndex.from_product([["geometry"], [0, 1]])
        if drop_active_geometry:
            frame._geometry_column_name = None
        return frame

    assert_geodataframe_equal(_build(), _build())

    # same comparison with the active geometry column name cleared
    assert_geodataframe_equal(
        _build(drop_active_geometry=True), _build(drop_active_geometry=True)
    )
@@ -0,0 +1,85 @@
from pandas import DataFrame, Series
from shapely.geometry import Point
from geopandas import GeoDataFrame, GeoSeries
class TestSeries:
    """Selections from a GeoSeries must come back as GeoSeries."""

    def setup_method(self):
        # ten points on the diagonal and their buffered polygons
        self.N = 10
        self.pts = GeoSeries([Point(i, i) for i in range(self.N)])
        self.polys = self.pts.buffer(0.5)

    def test_slice(self):
        for sliced in (self.pts[:2], self.pts[::2], self.polys[:2]):
            assert type(sliced) is GeoSeries

    def test_head(self):
        first_rows = self.pts.head()
        assert type(first_rows) is GeoSeries

    def test_tail(self):
        last_rows = self.pts.tail()
        assert type(last_rows) is GeoSeries

    def test_sort_index(self):
        ordered = self.pts.sort_index()
        assert type(ordered) is GeoSeries

    def test_loc(self):
        by_label = self.pts.loc[5:]
        assert type(by_label) is GeoSeries

    def test_iloc(self):
        by_position = self.pts.iloc[5:]
        assert type(by_position) is GeoSeries

    def test_fancy(self):
        # boolean mask selecting the odd index labels
        odd_labels = (self.pts.index.to_series() % 2).astype(bool)
        assert type(self.pts[odd_labels]) is GeoSeries

    def test_take(self):
        even_positions = list(range(0, self.N, 2))
        assert type(self.pts.take(even_positions)) is GeoSeries

    def test_groupby(self):
        for _parity, group in self.pts.groupby(lambda label: label % 2):
            assert type(group) is GeoSeries
class TestDataFrame:
    """Selections from a GeoDataFrame must come back with the right type."""

    def setup_method(self):
        # ten diagonal points with two derived numeric columns
        records = [
            {"geometry": Point(i, i), "value1": i + i, "value2": i * i}
            for i in range(10)
        ]
        self.df = GeoDataFrame(records)

    def test_geometry(self):
        assert type(self.df.geometry) is GeoSeries

        # still GeoSeries if the geometry column has a different name
        renamed = GeoDataFrame(
            {
                "coords": [Point(i, i) for i in range(5)],
                "nums": range(5),
            },
            geometry="coords",
        )
        assert type(renamed.geometry) is GeoSeries
        assert type(renamed["coords"]) is GeoSeries

    def test_nongeometry(self):
        column = self.df["value1"]
        assert type(column) is Series

    def test_geometry_multiple(self):
        subset = self.df[["geometry", "value1"]]
        assert type(subset) is GeoDataFrame

    def test_nongeometry_multiple(self):
        subset = self.df[["value1", "value2"]]
        assert type(subset) is DataFrame

    def test_slice(self):
        for sliced in (self.df[:2], self.df[::2]):
            assert type(sliced) is GeoDataFrame

    def test_fancy(self):
        # boolean mask selecting the odd index labels
        odd_labels = (self.df.index.to_series() % 2).astype(bool)
        assert type(self.df[odd_labels]) is GeoDataFrame
@@ -0,0 +1,151 @@
import os.path
from pandas import Series
from geopandas import GeoDataFrame
from geopandas.testing import ( # noqa: F401
assert_geoseries_equal,
geom_almost_equals,
geom_equals,
)
# Directory containing this module.
HERE = os.path.abspath(os.path.dirname(__file__))
# Two levels above HERE; the test data path below is built relative to it.
PACKAGE_DIR = os.path.dirname(os.path.dirname(HERE))
_TEST_DATA_DIR = os.path.join(PACKAGE_DIR, "geopandas", "tests", "data")
# Paths of the datasets used by the tests; the nybb data is a zip archive and
# is addressed with a "zip://" prefix.
_NYBB = "zip://" + os.path.join(_TEST_DATA_DIR, "nybb_16a.zip")
_NATURALEARTH_CITIES = os.path.join(
_TEST_DATA_DIR, "naturalearth_cities", "naturalearth_cities.shp"
)
_NATURALEARTH_LOWRES = os.path.join(
_TEST_DATA_DIR, "naturalearth_lowres", "naturalearth_lowres.shp"
)
# mock not used here, but the import from here is used in other modules
try:
from unittest import mock
except ImportError:
import mock # noqa: F401
def validate_boro_df(df, case_sensitive=False):
    """
    Assert that ``df`` looks like the nybb dataset.

    Parameters
    ----------
    df : GeoDataFrame
        Frame read from the nybb dataset.
    case_sensitive : bool, default False
        If True, column names must match exactly; otherwise they are
        compared case-insensitively.
    """
    assert isinstance(df, GeoDataFrame)
    # Five rows are expected, all required columns must be present and every
    # non-missing geometry must have been loaded as a MultiPolygon
    assert len(df) == 5
    required = ("BoroCode", "BoroName", "Shape_Leng", "Shape_Area")
    for name in required:
        if case_sensitive:
            assert name in df.columns
        else:
            assert any(name.lower() == found.lower() for found in df.columns)
    geometry_types = Series(df.geometry.geom_type).dropna()
    assert geometry_types.eq("MultiPolygon").all()
def get_srid(df):
    """Return the EPSG code of ``df.crs``, or 0 when none is available."""
    crs = df.crs
    if crs is None:
        return 0
    epsg = crs.to_epsg()
    # to_epsg() may give a falsy value (e.g. None); fall back to 0 then
    return epsg if epsg else 0
def create_spatialite(con, df):
    """
    Create and fill the nybb table through a SpatiaLite connection.

    The table is created (if missing), a geometry column and spatial index
    are registered for it, and one row is inserted per row of ``df``. All of
    this happens inside ``with con:``. Nothing is returned.

    Parameters
    ----------
    `con`: ``sqlite3.Connection``
    `df`: ``GeoDataFrame``
    """
    create_table = (
        "CREATE TABLE IF NOT EXISTS nybb "
        "( ogc_fid INTEGER PRIMARY KEY"
        ", borocode INTEGER"
        ", boroname TEXT"
        ", shape_leng REAL"
        ", shape_area REAL"
        ")"
    )
    insert_row = "INSERT INTO nybb VALUES(?, ?, ?, ?, ?, GeomFromText(?, ?))"

    with con:
        geometry_name = df.geometry.name
        srid = get_srid(df)

        def _rows():
            # one parameter tuple per record; a missing geometry becomes NULL
            for record in df.itertuples(index=False):
                shape = record.geometry
                yield (
                    None,
                    record.BoroCode,
                    record.BoroName,
                    record.Shape_Leng,
                    record.Shape_Area,
                    shape.wkt if shape else None,
                    srid,
                )

        con.execute(create_table)
        # the geometry type is taken from the first non-missing geometry
        con.execute(
            "SELECT AddGeometryColumn(?, ?, ?, ?)",
            ("nybb", geometry_name, srid, df.geom_type.dropna().iat[0].upper()),
        )
        con.execute("SELECT CreateSpatialIndex(?, ?)", ("nybb", geometry_name))
        con.executemany(insert_row, _rows())
def create_postgis(con, df, srid=None, geom_col="geom"):
    """
    Create and fill a nybb table in the test_geopandas PostGIS database.

    Any existing ``nybb`` table is dropped first. The connection is committed
    and the cursor closed even if one of the statements fails. Nothing is
    returned; errors propagate to the caller.

    Parameters
    ----------
    con : DB-API connection
        Connection to the PostGIS database (must provide ``cursor()`` and
        ``commit()`` and accept ``%s`` placeholders).
    df : GeoDataFrame
        The nybb data; rows are read from the "geometry", "BoroCode",
        "BoroName", "Shape_Leng" and "Shape_Area" columns.
    srid : int, optional
        If given, the geometry column is typed as MULTIPOLYGON with this SRID
        and inserted geometries are tagged with it.
    geom_col : str, default "geom"
        Name of the geometry column in the created table.
    """
    # If you'd like the tests relying on this helper to run, create a database
    # called 'test_geopandas' and enable postgis in it:
    # > createdb test_geopandas
    # > psql -c "CREATE EXTENSION postgis" -d test_geopandas
    if srid is not None:
        geom_schema = "geometry(MULTIPOLYGON, {})".format(srid)
        geom_insert = "ST_SetSRID(ST_GeometryFromText(%s), {})".format(srid)
    else:
        geom_schema = "geometry"
        geom_insert = "ST_GeometryFromText(%s)"

    create_sql = """CREATE TABLE nybb (
        {geom_col} {geom_schema},
        borocode integer,
        boroname varchar(40),
        shape_leng float,
        shape_area float
    );""".format(
        geom_col=geom_col, geom_schema=geom_schema
    )
    # the statement is identical for every row, so build it only once
    insert_sql = """INSERT INTO nybb VALUES ({}, %s, %s, %s, %s
    );""".format(
        geom_insert
    )

    # Obtain the cursor before entering the try block: if this call fails
    # there is nothing to close, and the original error must not be masked by
    # a NameError raised from the cleanup code.
    cursor = con.cursor()
    try:
        cursor.execute("DROP TABLE IF EXISTS nybb;")
        cursor.execute(create_sql)
        for _, row in df.iterrows():
            cursor.execute(
                insert_sql,
                (
                    row["geometry"].wkt,
                    row["BoroCode"],
                    row["BoroName"],
                    row["Shape_Leng"],
                    row["Shape_Area"],
                ),
            )
    finally:
        cursor.close()
        con.commit()
@@ -0,0 +1,15 @@
from .clip import clip
from .geocoding import geocode, reverse_geocode
from .overlay import overlay
from .sjoin import sjoin, sjoin_nearest
from .util import collect
# Names exported by `from geopandas.tools import *`; each one is imported
# from a submodule above.
__all__ = [
"collect",
"geocode",
"overlay",
"reverse_geocode",
"sjoin",
"sjoin_nearest",
"clip",
]
@@ -0,0 +1,84 @@
from warnings import warn
import numpy
from shapely.geometry import MultiPoint
from geopandas.array import from_shapely, points_from_xy
from geopandas.geoseries import GeoSeries
def uniform(geom, size, rng=None):
    """
    Sample uniformly at random from a geometry.

    Polygons are sampled uniformly within their area and lines uniformly
    along their length. For multi-part geometries, each part is weighted by
    the relevant attribute (area for Polygons, length for LineStrings) and
    points are then sampled uniformly from each part.

    Any other geometry type (e.g. Point, GeometryCollection) is not
    supported: a ``UserWarning`` is issued and an empty MultiPoint geometry
    is returned. A missing or empty geometry also gives an empty MultiPoint.

    Parameters
    ----------
    geom : any shapely.geometry.BaseGeometry type
        the shape that describes the area in which to sample.
    size : integer
        an integer denoting how many points to sample
    rng : optional
        passed as ``seed`` to ``numpy.random.default_rng`` to build the
        random generator used for sampling

    Returns
    -------
    shapely.MultiPoint geometry containing the sampled points

    Examples
    --------
    >>> from shapely.geometry import box
    >>> square = box(0,0,1,1)
    >>> uniform(square, size=102) # doctest: +SKIP
    """
    generator = numpy.random.default_rng(seed=rng)

    if geom is None or geom.is_empty:
        return MultiPoint()

    kind = geom.geom_type
    if kind in ("Polygon", "MultiPolygon"):
        sampler = _uniform_polygon
    elif kind in ("LineString", "MultiLineString"):
        sampler = _uniform_line
    else:
        warn(
            f"Sampling is not supported for {geom.geom_type} geometry type.",
            UserWarning,
            stacklevel=8,
        )
        return MultiPoint()

    return sampler(geom, size=size, generator=generator)
def _uniform_line(geom, size, generator):
    """
    Sample ``size`` points along an input shapely (multi)linestring.

    Random fractions are drawn from ``generator`` and interpolated along the
    line as normalized distances; the points are returned as a single union.
    """
    fractions = generator.uniform(size=size)
    sampled = geom.interpolate(fractions, normalized=True)
    return from_shapely(sampled).union_all()
def _uniform_polygon(geom, size, generator):
    """
    Sample ``size`` points from within a polygon using batched sampling.

    Batches of ``size`` candidate points are drawn from the bounding box of
    ``geom``; only those contained in ``geom`` are kept. Batches are drawn
    until at least ``size`` points were accepted, and the first ``size`` of
    them are returned as a single union.
    """
    west, south, east, north = geom.bounds
    accepted = []
    while len(accepted) < size:
        # draw x before y so the generator is consumed in a fixed order
        xs = generator.uniform(west, east, size=size)
        ys = generator.uniform(south, north, size=size)
        batch = points_from_xy(x=xs, y=ys)
        inside = batch.sindex.query(geom, predicate="contains")
        accepted.extend(batch[inside])
    return GeoSeries(accepted[:size]).union_all()
@@ -0,0 +1,169 @@
import importlib
import platform
import sys
def _get_sys_info():
    """System information

    Returns
    -------
    sys_info : dict
        system and Python version information, with the keys "python",
        "executable" and "machine"
    """
    return {
        # sys.version may span several lines; flatten it to one
        "python": sys.version.replace("\n", " "),
        "executable": sys.executable,
        "machine": platform.platform(),
    }
def _get_C_info():
    """Information on system PROJ, GDAL, GEOS

    Every lookup is best-effort: whenever a library or attribute is not
    available, ``None`` is reported for the corresponding entry.

    Returns
    -------
    c_info: dict
        versions and data/library locations of GEOS, GDAL and PROJ
    """
    # PROJ: version and data directory are looked up independently
    try:
        import pyproj

        proj_version = pyproj.proj_version_str
    except Exception:
        proj_version = None
    try:
        import pyproj

        proj_dir = pyproj.datadir.get_data_dir()
    except Exception:
        proj_dir = None

    # GEOS: prefer shapely's build configuration, otherwise fall back to the
    # version string exposed by shapely (no library path in that case)
    try:
        import shapely._buildcfg as buildcfg

        geos_info = (
            "{}.{}.{}".format(*buildcfg.geos_version),
            buildcfg.geos_library_path,
        )
    except Exception:
        try:
            from shapely import geos_version_string

            geos_info = (geos_version_string, None)
        except Exception:
            geos_info = (None, None)
    geos_version, geos_dir = geos_info

    # GDAL: ask pyogrio first
    try:
        import pyogrio

        gdal_info = (pyogrio.__gdal_version_string__, pyogrio.get_gdal_data_path())
    except Exception:
        gdal_info = (None, None)
    gdal_version, gdal_dir = gdal_info

    if gdal_version is None:
        # no version from pyogrio: take both values from fiona instead
        try:
            import fiona

            gdal_version = fiona.env.get_gdal_release_name()
        except Exception:
            gdal_version = None
        try:
            import fiona

            gdal_dir = fiona.env.GDALDataFinder().search()
        except Exception:
            gdal_dir = None

    return {
        "GEOS": geos_version,
        "GEOS lib": geos_dir,
        "GDAL": gdal_version,
        "GDAL data dir": gdal_dir,
        "PROJ": proj_version,
        "PROJ data dir": proj_dir,
    }
def _get_deps_info():
    """Overview of the installed version of main dependencies

    Returns
    -------
    deps_info: dict
        version information on relevant Python libraries; the value is
        ``None`` when a library cannot be imported or has no ``__version__``
    """
    required = ("numpy", "pandas", "pyproj", "shapely")
    optional = (
        "pyogrio",
        "geoalchemy2",
        "geopy",
        "matplotlib",
        "mapclassify",
        "fiona",
        "psycopg",
        "psycopg2",
        "pyarrow",
    )

    versions = {}
    for name in ("geopandas", *required, *optional):
        try:
            # reuse an already imported module, import it otherwise
            if name in sys.modules:
                module = sys.modules[name]
            else:
                module = importlib.import_module(name)
            version = module.__version__
        except Exception:
            version = None
        versions[name] = version
    return versions
def show_versions():
    """
    Print system information and installed module versions.

    Three sections are printed: system information, the GEOS/GDAL/PROJ
    libraries, and the Python dependencies. Keys are left-aligned to the
    width of the longest key of any section, so that the values of all
    sections line up in one column.

    Examples
    --------

    ::

        $ python -c "import geopandas; geopandas.show_versions()"
    """
    sections = (
        ("SYSTEM INFO", _get_sys_info()),
        ("GEOS, GDAL, PROJ INFO", _get_C_info()),
        ("PYTHON DEPENDENCIES", _get_deps_info()),
    )

    # Take every section into account: some library keys (e.g.
    # "GDAL data dir") are longer than any dependency name.
    maxlen = max(len(key) for _, info in sections for key in info)

    for title, info in sections:
        print(f"\n{title}")
        print("-" * len(title))
        for k, stat in info.items():
            print(f"{k:<{maxlen}}: {stat}")
@@ -0,0 +1,257 @@
"""
geopandas.clip
==============
A module to clip vector data using GeoPandas.
"""
import warnings
import numpy as np
import pandas.api.types
from shapely.geometry import MultiPolygon, Polygon, box
from geopandas import GeoDataFrame, GeoSeries
from geopandas.array import _check_crs, _crs_mismatch_warn
def _mask_is_list_like_rectangle(mask):
    """Return True if ``mask`` is list-like but not a geometry container.

    GeoDataFrame, GeoSeries, Polygon and MultiPolygon masks are never treated
    as a rectangle, even if pandas considers them list-like.
    """
    if not pandas.api.types.is_list_like(mask):
        return False
    geometry_masks = (GeoDataFrame, GeoSeries, Polygon, MultiPolygon)
    return not isinstance(mask, geometry_masks)
def _clip_gdf_with_mask(gdf, mask, sort=False):
    """Clip geometry to the polygon/rectangle extent.

    Rows of ``gdf`` intersecting the mask are selected through the spatial
    index; every selected geometry that is not a Point is then replaced by
    its part inside the mask.

    Parameters
    ----------
    gdf : GeoDataFrame, GeoSeries
        Dataframe to clip.
    mask : (Multi)Polygon, list-like
        Reference polygon/rectangle for clipping.
    sort : boolean, default False
        If True, the results will be sorted in ascending order using the
        geometries' indexes as the primary key.

    Returns
    -------
    GeoDataFrame
        The returned GeoDataFrame is a clipped subset of gdf
        that intersects with polygon/rectangle.
    """
    clipping_by_rectangle = _mask_is_list_like_rectangle(mask)
    intersection_polygon = box(*mask) if clipping_by_rectangle else mask

    candidate_rows = gdf.sindex.query(
        intersection_polygon, predicate="intersects", sort=sort
    )
    gdf_sub = gdf.iloc[candidate_rows]

    # For performance reasons points don't need to be intersected with poly
    non_point_mask = gdf_sub.geom_type != "Point"
    if not non_point_mask.any():
        # only points, directly return
        return gdf_sub

    def _cut(geometries):
        # rectangle masks use the dedicated rectangle clipping
        if clipping_by_rectangle:
            return geometries.clip_by_rect(*mask)
        return geometries.intersection(mask)

    # Clip the data with the polygon
    clipped = gdf_sub.copy()
    if isinstance(gdf_sub, GeoDataFrame):
        clipped.loc[non_point_mask, clipped._geometry_column_name] = _cut(
            gdf_sub.geometry.values[non_point_mask]
        )
    else:
        # GeoSeries
        clipped[non_point_mask] = _cut(gdf_sub.values[non_point_mask])

    if clipping_by_rectangle:
        # clip_by_rect might return empty geometry collections in edge cases
        clipped = clipped[~clipped.is_empty]

    return clipped
def clip(gdf, mask, keep_geom_type=False, sort=False):
    """Clip points, lines, or polygon geometries to the mask extent.

    Both layers must be in the same Coordinate Reference System (CRS).
    The ``gdf`` will be clipped to the full extent of the clip object.

    If there are multiple polygons in mask, data from ``gdf`` will be
    clipped to the total boundary of all polygons in mask.

    If the ``mask`` is list-like with four elements ``(minx, miny, maxx, maxy)``, a
    faster rectangle clipping algorithm will be used. Note that this can lead to
    slightly different results in edge cases, e.g. if a line would be reduced to a
    point, this point might not be returned.
    The geometry is clipped in a fast but possibly dirty way. The output is not
    guaranteed to be valid. No exceptions will be raised for topological errors.

    Parameters
    ----------
    gdf : GeoDataFrame or GeoSeries
        Vector layer (point, line, polygon) to be clipped to mask.
    mask : GeoDataFrame, GeoSeries, (Multi)Polygon, list-like
        Polygon vector layer used to clip ``gdf``.
        The mask's geometry is dissolved into one geometric feature
        and intersected with ``gdf``.
        If the mask is list-like with four elements ``(minx, miny, maxx, maxy)``,
        ``clip`` will use a faster rectangle clipping (:meth:`~GeoSeries.clip_by_rect`),
        possibly leading to slightly different results.
    keep_geom_type : boolean, default False
        If True, return only geometries of original type in case of intersection
        resulting in multiple geometry types or GeometryCollections.
        If False, return all resulting geometries (potentially mixed-types).
    sort : boolean, default False
        If True, the results will be sorted in ascending order using the
        geometries' indexes as the primary key.

    Returns
    -------
    GeoDataFrame or GeoSeries
         Vector data (points, lines, polygons) from ``gdf`` clipped to
         polygon boundary from mask.

    Raises
    ------
    TypeError
        If ``gdf`` is not a GeoDataFrame or GeoSeries, if ``mask`` has an
        unsupported type, or if a list-like ``mask`` does not have exactly
        four values.

    See also
    --------
    GeoDataFrame.clip : equivalent GeoDataFrame method
    GeoSeries.clip : equivalent GeoSeries method

    Examples
    --------
    Clip points (grocery stores) with polygons (the Near West Side community):

    >>> import geodatasets
    >>> chicago = geopandas.read_file(
    ...     geodatasets.get_path("geoda.chicago_health")
    ... )
    >>> near_west_side = chicago[chicago["community"] == "NEAR WEST SIDE"]
    >>> groceries = geopandas.read_file(
    ...     geodatasets.get_path("geoda.groceries")
    ... ).to_crs(chicago.crs)
    >>> groceries.shape
    (148, 8)

    >>> nws_groceries = geopandas.clip(groceries, near_west_side)
    >>> nws_groceries.shape
    (7, 8)
    """
    if not isinstance(gdf, (GeoDataFrame, GeoSeries)):
        raise TypeError(
            "'gdf' should be GeoDataFrame or GeoSeries, got {}".format(type(gdf))
        )

    mask_is_list_like = _mask_is_list_like_rectangle(mask)
    if (
        not isinstance(mask, (GeoDataFrame, GeoSeries, Polygon, MultiPolygon))
        and not mask_is_list_like
    ):
        raise TypeError(
            "'mask' should be GeoDataFrame, GeoSeries, "
            f"(Multi)Polygon or list-like, got {type(mask)}"
        )

    if mask_is_list_like and len(mask) != 4:
        raise TypeError(
            "If 'mask' is list-like, it must have four values (minx, miny, maxx, maxy)"
        )

    # Determine the bounding box of the mask; geopandas masks are additionally
    # checked for a CRS mismatch with ``gdf`` (warning only).
    mask_is_geopandas = isinstance(mask, (GeoDataFrame, GeoSeries))
    if mask_is_geopandas:
        if not _check_crs(gdf, mask):
            _crs_mismatch_warn(gdf, mask, stacklevel=3)
        box_mask = mask.total_bounds
    elif mask_is_list_like:
        box_mask = mask
    else:
        # Avoid empty tuple returned by .bounds when geometry is empty. A tuple of
        # all nan values is consistent with the behavior of
        # {GeoSeries, GeoDataFrame}.total_bounds for empty geometries.
        # TODO(shapely) can simply use mask.bounds once relying on Shapely 2.0
        box_mask = mask.bounds if not mask.is_empty else (np.nan,) * 4

    # Cheap rejection: if the bounding boxes do not overlap nothing can be
    # clipped, so return an empty selection of ``gdf``. Comparisons with nan
    # bounds are False, so empty inputs end up here as well.
    box_gdf = gdf.total_bounds
    if not (
        ((box_mask[0] <= box_gdf[2]) and (box_gdf[0] <= box_mask[2]))
        and ((box_mask[1] <= box_gdf[3]) and (box_gdf[1] <= box_mask[3]))
    ):
        return gdf.iloc[:0]

    if mask_is_geopandas:
        # dissolve all mask geometries into a single geometry
        combined_mask = mask.geometry.union_all()
    else:
        combined_mask = mask

    clipped = _clip_gdf_with_mask(gdf, combined_mask, sort=sort)

    if keep_geom_type:
        geomcoll_concat = (clipped.geom_type == "GeometryCollection").any()
        geomcoll_orig = (gdf.geom_type == "GeometryCollection").any()

        new_collection = geomcoll_concat and not geomcoll_orig

        if geomcoll_orig:
            warnings.warn(
                "keep_geom_type can not be called on a "
                "GeoDataFrame with GeometryCollection.",
                stacklevel=2,
            )
        else:
            polys = ["Polygon", "MultiPolygon"]
            lines = ["LineString", "MultiLineString", "LinearRing"]
            points = ["Point", "MultiPoint"]

            # Check that the gdf for multiple geom types (points, lines and/or polys)
            orig_types_total = sum(
                [
                    gdf.geom_type.isin(polys).any(),
                    gdf.geom_type.isin(lines).any(),
                    gdf.geom_type.isin(points).any(),
                ]
            )

            # Check how many geometry types are in the clipped GeoDataFrame
            clip_types_total = sum(
                [
                    clipped.geom_type.isin(polys).any(),
                    clipped.geom_type.isin(lines).any(),
                    clipped.geom_type.isin(points).any(),
                ]
            )

            # Check there aren't any new geom types in the clipped GeoDataFrame
            more_types = orig_types_total < clip_types_total

            if orig_types_total > 1:
                warnings.warn(
                    "keep_geom_type can not be called on a mixed type GeoDataFrame.",
                    stacklevel=2,
                )
            elif new_collection or more_types:
                orig_type = gdf.geom_type.iloc[0]
                if new_collection:
                    # split collections so their parts can be filtered by type
                    clipped = clipped.explode(index_parts=False)
                if orig_type in polys:
                    clipped = clipped.loc[clipped.geom_type.isin(polys)]
                elif orig_type in lines:
                    clipped = clipped.loc[clipped.geom_type.isin(lines)]

    return clipped
@@ -0,0 +1,184 @@
import time
from collections import defaultdict
import pandas as pd
from shapely.geometry import Point
import geopandas
def _get_throttle_time(provider):
    """
    Amount of time to wait between requests to a geocoding API, for providers
    that specify rate limits in their terms of service.

    Parameters
    ----------
    provider : str or geopy.geocoder
        Either a geopy service name (e.g. ``"nominatim"``) or a geopy
        geocoder class.

    Returns
    -------
    int
        Seconds to wait between two requests: 1 for Nominatim, 0 otherwise.
    """
    # https://operations.osmfoundation.org/policies/nominatim/
    if isinstance(provider, str):
        # A service name never compares equal to the geocoder class, so
        # handle it explicitly; otherwise provider="nominatim" would not be
        # throttled. The name is compared case-insensitively (geopy is
        # expected to resolve service names the same way).
        return 1 if provider.lower() == "nominatim" else 0

    import geopy.geocoders

    if provider == geopy.geocoders.Nominatim:
        return 1
    return 0
def geocode(strings, provider=None, **kwargs):
    """
    Geocode a set of strings and get a GeoDataFrame of the resulting points.

    Parameters
    ----------
    strings : list or Series of addresses to geocode
    provider : str or geopy.geocoder
        Specifies geocoding service to use. If none is provided,
        will use 'photon' (see the Photon's terms of service at:
        https://photon.komoot.io).

        Either the string name used by geopy (as specified in
        geopy.geocoders.SERVICE_TO_GEOCODER) or a geopy Geocoder instance
        (e.g., geopy.geocoders.Photon) may be used.

        Some providers require additional arguments such as access keys
        See each geocoder's specific parameters in geopy.geocoders
    **kwargs
        Passed on to the geocoder when it is instantiated.

    Notes
    -----
    Ensure proper use of the results by consulting the Terms of Service for
    your provider.

    Geocoding requires geopy. Install it using 'pip install geopy'. See also
    https://github.com/geopy/geopy

    Examples
    --------
    >>> df = geopandas.tools.geocode(  # doctest: +SKIP
    ...         ["boston, ma", "1600 pennsylvania ave. washington, dc"]
    ...     )
    >>> df  # doctest: +SKIP
                         geometry                                            address
    0  POINT (-71.05863 42.35899)                          Boston, MA, United States
    1  POINT (-77.03651 38.89766)  1600 Pennsylvania Ave NW, Washington, DC 20006...
    """
    service = "photon" if provider is None else provider
    pause = _get_throttle_time(service)
    # forward=True: look up coordinates for the given address strings
    return _query(strings, True, service, pause, **kwargs)
def reverse_geocode(points, provider=None, **kwargs):
    """
    Reverse geocode a set of points and get a GeoDataFrame of the resulting
    addresses.

    Parameters
    ----------
    points : list or Series of Shapely Point objects.
        x coordinate is longitude
        y coordinate is latitude
    provider : str or geopy.geocoder (opt)
        Specifies geocoding service to use. If none is provided,
        will use 'photon' (see the Photon's terms of service at:
        https://photon.komoot.io).

        Either the string name used by geopy (as specified in
        geopy.geocoders.SERVICE_TO_GEOCODER) or a geopy Geocoder instance
        (e.g., geopy.geocoders.Photon) may be used.

        Some providers require additional arguments such as access keys
        See each geocoder's specific parameters in geopy.geocoders
    **kwargs
        Passed on to the geocoder when it is instantiated.

    Notes
    -----
    Ensure proper use of the results by consulting the Terms of Service for
    your provider.

    Reverse geocoding requires geopy. Install it using 'pip install geopy'.
    See also https://github.com/geopy/geopy

    Examples
    --------
    >>> from shapely.geometry import Point
    >>> df = geopandas.tools.reverse_geocode(  # doctest: +SKIP
    ...     [Point(-71.0594869, 42.3584697), Point(-77.0365305, 38.8977332)]
    ... )
    >>> df  # doctest: +SKIP
                         geometry                                            address
    0  POINT (-71.05941 42.35837)       29 Court Sq, Boston, MA 02108, United States
    1  POINT (-77.03641 38.89766)  1600 Pennsylvania Ave NW, Washington, DC 20006...
    """
    service = "photon" if provider is None else provider
    pause = _get_throttle_time(service)
    # forward=False: look up addresses for the given points
    return _query(points, False, service, pause, **kwargs)
def _query(data, forward, provider, throttle_time, **kwargs):
    """
    Generic wrapper for calls over lists to geopy Geocoders.

    ``data`` is coerced to a Series (forward) or GeoSeries (reverse), every
    item is sent to the geocoder, and ``throttle_time`` seconds are slept
    after each request. Items for which the geocoder raises a
    ``GeocoderQueryError`` or ``ValueError`` are recorded as ``(None, None)``.
    The collected results are returned as a GeoDataFrame.
    """
    from geopy.geocoders import get_geocoder_for_service
    from geopy.geocoders.base import GeocoderQueryError

    container = pd.Series if forward else geopandas.GeoSeries
    if not isinstance(data, container):
        data = container(data)

    if isinstance(provider, str):
        provider = get_geocoder_for_service(provider)
    coder = provider(**kwargs)

    def _lookup(item):
        if forward:
            return coder.geocode(item)
        # the geocoder is given (latitude, longitude), i.e. (y, x)
        return coder.reverse((item.y, item.x), exactly_one=True)

    results = {}
    for label, item in data.items():
        try:
            results[label] = _lookup(item)
        except (GeocoderQueryError, ValueError):
            results[label] = (None, None)
        time.sleep(throttle_time)

    return _prepare_geocode_result(results)
def _prepare_geocode_result(results):
    """
    Helper function for the geocode function

    Takes a dict where keys are index entries, values are tuples containing:
    (address, (lat, lon))

    A value of ``None``, or a missing location, gives an empty Point. The
    result is a GeoDataFrame with the columns "geometry" and "address",
    indexed by the keys of ``results`` and using "EPSG:4326" as its CRS.
    """
    # Prepare the data for the DataFrame as a dict of lists
    columns = defaultdict(list)
    labels = []

    for label, hit in results.items():
        address, loc = (None, None) if hit is None else hit
        # loc is lat, lon and we want lon, lat
        point = Point() if loc is None else Point(loc[1], loc[0])

        columns["geometry"].append(point)
        columns["address"].append(address)
        labels.append(label)

    return geopandas.GeoDataFrame(columns, index=labels, crs="EPSG:4326")
@@ -0,0 +1,188 @@
import numpy as np
def _hilbert_distance(geoms, total_bounds=None, level=16):
    """
    Calculate the distance along a Hilbert curve.

    The distances are calculated for the midpoints of the bounds of the
    given geometries.

    Parameters
    ----------
    geoms : GeometryArray
    total_bounds : 4-element array
        Total bounds of geometries - array
    level : int (1 - 16), default 16
        Determines the precision of the curve (points on the curve will
        have coordinates in the range [0, 2^level - 1]).

    Returns
    -------
    np.ndarray
        Array containing distances along the Hilbert curve

    Raises
    ------
    ValueError
        If any geometry is empty or missing.
    """
    has_unusable = geoms.is_empty.any() | geoms.isna().any()
    if has_unusable:
        raise ValueError(
            "Hilbert distance cannot be computed on a GeoSeries with empty or "
            "missing geometries.",
        )

    # Discretise the midpoints of the per-geometry bounds, then encode them
    # as distances along the curve
    x, y = _continuous_to_discrete_coords(geoms.bounds, level, total_bounds)
    return _encode(level, x, y)
def _continuous_to_discrete_coords(bounds, level, total_bounds):
    """
    Snap the midpoints of geometry bounds onto the integer Hilbert grid.

    Parameters
    ----------
    bounds : array
        Bounds of each geometry, one row per geometry; columns 0/2 are
        averaged into the x midpoint and columns 1/3 into the y midpoint.
    level : int
        Number of iterations used in constructing the Hilbert curve; the
        grid spans ``[0, 2**level - 1]`` along each axis.
    total_bounds : 4-element array or None
        ``(xmin, ymin, xmax, ymax)`` used for scaling. When ``None``, the
        NaN-ignoring min/max of the midpoints is used instead.

    Returns
    -------
    tuple of np.ndarray
        ``(x_int, y_int)``: discrete x and y coordinates of each midpoint.
    """
    # Largest coordinate value on the Hilbert grid
    grid_max = (2**level) - 1

    # Midpoint of every bounding box along each axis
    mid_x = (bounds[:, 0] + bounds[:, 2]) / 2.0
    mid_y = (bounds[:, 1] + bounds[:, 3]) / 2.0

    if total_bounds is None:
        # Fall back to the extent covered by the midpoints themselves
        total_bounds = (
            np.nanmin(mid_x),
            np.nanmin(mid_y),
            np.nanmax(mid_x),
            np.nanmax(mid_y),
        )
    xmin, ymin, xmax, ymax = total_bounds

    # Scale each axis independently to integers in [0, grid_max]
    discrete_x = _continuous_to_discrete(mid_x, (xmin, xmax), grid_max)
    discrete_y = _continuous_to_discrete(mid_y, (ymin, ymax), grid_max)
    return discrete_x, discrete_y
def _continuous_to_discrete(vals, val_range, n):
    """
    Scale continuous values linearly onto the integers ``0 .. n``.

    Parameters
    ----------
    vals : Array of continuous values
    val_range : Tuple ``(low, high)`` that is mapped onto ``(0, n)``
    n : Largest discrete value to produce

    Returns
    -------
    One-dimensional array of ``uint32``. Values outside ``val_range`` are
    clipped to ``0`` or ``n``; a zero-width range yields all zeros.
    """
    low = val_range[0]
    span = val_range[1] - low
    if span == 0:
        # Degenerate range: every value lands on the same grid cell
        return np.zeros_like(vals, dtype=np.uint32)

    scaled = (vals - low) * (n / span)
    # Clamp in place so out-of-range inputs stay on the grid
    np.clip(scaled, 0, n, out=scaled)
    return scaled.astype(np.uint32)
# Fast Hilbert curve algorithm by http://threadlocalmutex.com/
# From C++ https://github.com/rawrunprotected/hilbert_curves
# (public domain)

# Highest curve level accepted by _encode; larger levels raise ValueError.
MAX_LEVEL = 16
def _interleave(x):
    """
    Spread the bits of ``x`` apart by inserting a zero between neighbours.

    Works on plain integers and on NumPy integer arrays alike. After the
    four shift-and-mask rounds only the even bit positions (mask
    ``0x55555555``) can be set.
    """
    rounds = (
        (8, 0x00FF00FF),
        (4, 0x0F0F0F0F),
        (2, 0x33333333),
        (1, 0x55555555),
    )
    for shift, mask in rounds:
        x = (x | (x << shift)) & mask
    return x
def _encode(level, x, y):
    """
    Encode discrete ``(x, y)`` grid coordinates as Hilbert curve indices.

    Parameters
    ----------
    level : int
        Curve level; must not exceed ``MAX_LEVEL``.
    x, y : array-like
        Discrete coordinates, converted to ``uint32`` arrays.

    Returns
    -------
    np.ndarray
        Index along the curve for every coordinate pair.

    Raises
    ------
    ValueError
        If ``level`` is greater than ``MAX_LEVEL``.
    """
    x = np.asarray(x, dtype="uint32")
    y = np.asarray(y, dtype="uint32")

    if level > MAX_LEVEL:
        raise ValueError("Level out of range")

    # Left-align the coordinates within a 16-bit field
    pad = 16 - level
    x = x << pad
    y = y << pad

    # Initial prefix scan round, prime with x and y
    a = x ^ y
    b = 0xFFFF ^ a
    c = 0xFFFF ^ (x | y)
    d = x & (y ^ 0xFFFF)

    A = a | (b >> 1)
    B = (a >> 1) ^ a
    C = ((c >> 1) ^ (b & (d >> 1))) ^ c
    D = ((a & (c >> 1)) ^ (d >> 1)) ^ d

    # Two identical scan rounds with a doubling shift. The lowercase names
    # are snapshots: C and D are updated in place, so c and d must be copies.
    for shift in (2, 4):
        a, b, c, d = A.copy(), B.copy(), C.copy(), D.copy()

        A = (a & (a >> shift)) ^ (b & (b >> shift))
        B = (a & (b >> shift)) ^ (b & ((a ^ b) >> shift))
        C ^= (a & (c >> shift)) ^ (b & (d >> shift))
        D ^= (b & (c >> shift)) ^ ((a ^ b) & (d >> shift))

    # Final round and projection (only C and D are needed from here on)
    a, b, c, d = A.copy(), B.copy(), C.copy(), D.copy()

    C ^= (a & (c >> 8)) ^ (b & (d >> 8))
    D ^= (b & (c >> 8)) ^ ((a ^ b) & (d >> 8))

    # Undo transformation prefix scan
    a = C ^ (C >> 1)
    b = D ^ (D >> 1)

    # Recover index bits
    i0 = x ^ y
    i1 = b | (0xFFFF ^ (i0 | a))

    # Interleave the two bit streams and drop the padding added above
    return ((_interleave(i1) << 1) | _interleave(i0)) >> (32 - 2 * level)
@@ -0,0 +1,399 @@
import warnings
from functools import reduce
import numpy as np
import pandas as pd
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import PANDAS_GE_30
from geopandas.array import _check_crs, _crs_mismatch_warn
def _ensure_geometry_column(df):
    """
    Make sure the active geometry column of ``df`` is called 'geometry'.

    A different, pre-existing column named 'geometry' is dropped before the
    active geometry column is renamed. On pandas < 3.0 the frame passed in
    is modified in place; on pandas >= 3.0 new objects are created. In both
    cases the frame to use afterwards is the return value.
    """
    if df._geometry_column_name == "geometry":
        # Already named as required, nothing to do
        return df

    has_clash = "geometry" in df.columns
    if PANDAS_GE_30:
        if has_clash:
            df = df.drop("geometry", axis=1)
        return df.rename_geometry("geometry")

    if has_clash:
        df.drop("geometry", axis=1, inplace=True)
    df.rename_geometry("geometry", inplace=True)
    return df
def _overlay_intersection(df1, df2):
    """
    Overlay Intersection operation used in overlay function.

    Returns one row per intersecting pair of geometries, carrying the
    attributes of both inputs (overlapping column names are suffixed with
    "_1" / "_2") plus the helper columns ``__idx1`` and ``__idx2`` holding
    the positions of the pair in ``df1`` and ``df2``.
    """
    # Spatial index query: positional pairs of intersecting geometries
    idx1, idx2 = df2.sindex.query(df1.geometry, predicate="intersects", sort=True)

    if idx1.size == 0 or idx2.size == 0:
        # No pair intersects: build an empty frame with the expected columns
        empty = df1.iloc[:0].merge(
            df2.iloc[:0].drop(df2.geometry.name, axis=1),
            left_index=True,
            right_index=True,
            suffixes=("_1", "_2"),
        )
        empty["__idx1"] = np.nan
        empty["__idx2"] = np.nan
        # keep the geometry column last
        geom_name = df1.geometry.name
        return empty[empty.columns.drop(geom_name).tolist() + [geom_name]]

    # Line the paired geometries up on a fresh 0..n-1 index and intersect
    left_geoms = df1.geometry.take(idx1)
    left_geoms.reset_index(drop=True, inplace=True)
    right_geoms = df2.geometry.take(idx2)
    right_geoms.reset_index(drop=True, inplace=True)
    intersections = left_geoms.intersection(right_geoms)

    # Repair polygonal results
    is_polygonal = intersections.geom_type.isin(["Polygon", "MultiPolygon"])
    intersections.loc[is_polygonal] = intersections[is_polygonal].make_valid()

    # Attach the attributes of both inputs to every pair
    pairs = pd.DataFrame({"__idx1": idx1, "__idx2": idx2})
    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    attrs1 = df1.drop(df1._geometry_column_name, axis=1)
    attrs2 = df2.drop(df2._geometry_column_name, axis=1)
    merged = pairs.merge(attrs1, left_on="__idx1", right_index=True)
    merged = merged.merge(
        attrs2,
        left_on="__idx2",
        right_index=True,
        suffixes=("_1", "_2"),
    )

    return GeoDataFrame(merged, geometry=intersections, crs=df1.crs)
def _overlay_difference(df1, df2):
    """
    Overlay Difference operation used in overlay function.

    For every geometry of ``df1``, subtracts all ``df2`` geometries that
    intersect it. Rows whose remaining geometry is empty are dropped.

    Returns
    -------
    GeoDataFrame
        Copy of the surviving rows of ``df1`` with the geometry column
        replaced by the computed differences.
    """
    # spatial index query to find intersections
    idx1, idx2 = df2.sindex.query(df1.geometry, predicate="intersects", sort=True)

    # The pairs are sorted, so the df2 positions belonging to one df1 row
    # form a contiguous run; split idx2 at the start of every run.
    idx1_unique, idx1_unique_indices = np.unique(idx1, return_index=True)
    idx2_split = np.split(idx2, idx1_unique_indices[1:])

    # Map df1 position -> positions of its df2 neighbours. A dict lookup
    # replaces a linear array scan plus list.pop(0) for every single row.
    neighbours_of = dict(zip(idx1_unique.tolist(), idx2_split))

    # Create differences
    right_geoms = df2.geometry
    new_g = [
        reduce(
            lambda x, y: x.difference(y),
            [geom] + list(right_geoms.iloc[neighbours_of.get(pos, [])]),
        )
        for pos, geom in enumerate(df1.geometry)
    ]
    differences = GeoSeries(new_g, index=df1.index, crs=df1.crs)

    # Repair polygonal results
    poly_ix = differences.geom_type.isin(["Polygon", "MultiPolygon"])
    differences.loc[poly_ix] = differences[poly_ix].make_valid()

    # Only keep rows that still have something left
    keep = ~differences.is_empty
    geom_diff = differences[keep].copy()
    dfdiff = df1[keep].copy()
    dfdiff[dfdiff._geometry_column_name] = geom_diff
    return dfdiff
def _overlay_symmetric_diff(df1, df2):
    """
    Overlay Symmetric Difference operation used in overlay function.

    Combines ``df1 - df2`` and ``df2 - df1`` into one GeoDataFrame. The
    helper columns ``__idx1`` / ``__idx2`` number the rows coming from the
    respective side and are NaN for rows from the other side.
    """
    only_in_1 = _overlay_difference(df1, df2)
    only_in_2 = _overlay_difference(df2, df1)

    only_in_1["__idx1"] = range(len(only_in_1))
    only_in_1["__idx2"] = np.nan
    only_in_2["__idx2"] = range(len(only_in_2))
    only_in_2["__idx1"] = np.nan

    # ensure geometry name (otherwise merge goes wrong)
    only_in_1 = _ensure_geometry_column(only_in_1)
    only_in_2 = _ensure_geometry_column(only_in_2)

    # combine both 'difference' dataframes
    merged = only_in_1.merge(
        only_in_2, on=["__idx1", "__idx2"], how="outer", suffixes=("_1", "_2")
    )

    # Collapse the two suffixed geometry columns into a single one: take
    # the left geometry and fill the gaps from the right.
    geometry = merged.geometry_1.copy()
    geometry.name = "geometry"
    left_missing = merged.geometry_1.isnull()
    # https://github.com/pandas-dev/pandas/issues/26468 use loc for now
    geometry.loc[left_missing] = merged.loc[left_missing, "geometry_2"]

    merged.drop(["geometry_1", "geometry_2"], axis=1, inplace=True)
    merged.reset_index(drop=True, inplace=True)
    return GeoDataFrame(merged, geometry=geometry, crs=df1.crs)
def _overlay_union(df1, df2):
    """
    Overlay Union operation used in overlay function.

    Stacks the intersection and the symmetric difference of the two frames
    and moves the 'geometry' column to the last position.
    """
    parts = [
        _overlay_intersection(df1, df2),
        _overlay_symmetric_diff(df1, df2),
    ]
    stacked = pd.concat(parts, ignore_index=True, sort=False)

    # keep geometry column last
    ordered = list(stacked.columns)
    ordered.append(ordered.pop(ordered.index("geometry")))
    return stacked.reindex(columns=ordered)
def overlay(df1, df2, how="intersection", keep_geom_type=None, make_valid=True):
    """Perform spatial overlay between two GeoDataFrames.

    Currently only supports data GeoDataFrames with uniform geometry types,
    i.e. containing only (Multi)Polygons, or only (Multi)Points, or a
    combination of (Multi)LineString and LinearRing shapes.
    Implements several methods that are all effectively subsets of the union.

    See the User Guide page :doc:`../../user_guide/set_operations` for details.

    Parameters
    ----------
    df1 : GeoDataFrame
    df2 : GeoDataFrame
    how : string
        Method of spatial overlay: 'intersection', 'union',
        'identity', 'symmetric_difference' or 'difference'.
    keep_geom_type : bool
        If True, return only geometries of the same geometry type as df1 has,
        if False, return all resulting geometries. Default is None,
        which will set keep_geom_type to True but warn upon dropping
        geometries.
    make_valid : bool, default True
        If True, any invalid input geometries are corrected with a call to
        make_valid(), if False, a `ValueError` is raised if any input
        geometries are invalid.

    Returns
    -------
    df : GeoDataFrame
        GeoDataFrame with new set of polygons and attributes
        resulting from the overlay

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported methods, or if
        ``make_valid=False`` and invalid polygonal input geometries exist.
    NotImplementedError
        If a GeoSeries is passed, or an input mixes geometry type families.
    TypeError
        If ``keep_geom_type`` is in effect and the geometry type of the first
        row of ``df1`` is not a polygon, line or point type.

    Examples
    --------
    >>> from shapely.geometry import Polygon
    >>> polys1 = geopandas.GeoSeries([Polygon([(0,0), (2,0), (2,2), (0,2)]),
    ...                               Polygon([(2,2), (4,2), (4,4), (2,4)])])
    >>> polys2 = geopandas.GeoSeries([Polygon([(1,1), (3,1), (3,3), (1,3)]),
    ...                               Polygon([(3,3), (5,3), (5,5), (3,5)])])
    >>> df1 = geopandas.GeoDataFrame({'geometry': polys1, 'df1_data':[1,2]})
    >>> df2 = geopandas.GeoDataFrame({'geometry': polys2, 'df2_data':[1,2]})

    >>> geopandas.overlay(df1, df2, how='union')
       df1_data  df2_data                                           geometry
    0       1.0       1.0                POLYGON ((2 2, 2 1, 1 1, 1 2, 2 2))
    1       2.0       1.0                POLYGON ((2 2, 2 3, 3 3, 3 2, 2 2))
    2       2.0       2.0                POLYGON ((4 4, 4 3, 3 3, 3 4, 4 4))
    3       1.0       NaN      POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))
    4       2.0       NaN  MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...
    5       NaN       1.0  MULTIPOLYGON (((2 3, 2 2, 1 2, 1 3, 2 3)), ((3...
    6       NaN       2.0      POLYGON ((3 5, 5 5, 5 3, 4 3, 4 4, 3 4, 3 5))

    >>> geopandas.overlay(df1, df2, how='intersection')
       df1_data  df2_data                             geometry
    0         1         1  POLYGON ((2 2, 2 1, 1 1, 1 2, 2 2))
    1         2         1  POLYGON ((2 2, 2 3, 3 3, 3 2, 2 2))
    2         2         2  POLYGON ((4 4, 4 3, 3 3, 3 4, 4 4))

    >>> geopandas.overlay(df1, df2, how='symmetric_difference')
       df1_data  df2_data                                           geometry
    0       1.0       NaN      POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))
    1       2.0       NaN  MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...
    2       NaN       1.0  MULTIPOLYGON (((2 3, 2 2, 1 2, 1 3, 2 3)), ((3...
    3       NaN       2.0      POLYGON ((3 5, 5 5, 5 3, 4 3, 4 4, 3 4, 3 5))

    >>> geopandas.overlay(df1, df2, how='difference')
                                                geometry  df1_data
    0      POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))         1
    1  MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...         2

    >>> geopandas.overlay(df1, df2, how='identity')
       df1_data  df2_data                                           geometry
    0       1.0       1.0                POLYGON ((2 2, 2 1, 1 1, 1 2, 2 2))
    1       2.0       1.0                POLYGON ((2 2, 2 3, 3 3, 3 2, 2 2))
    2       2.0       2.0                POLYGON ((4 4, 4 3, 3 3, 3 4, 4 4))
    3       1.0       NaN      POLYGON ((2 0, 0 0, 0 2, 1 2, 1 1, 2 1, 2 0))
    4       2.0       NaN  MULTIPOLYGON (((3 4, 3 3, 2 3, 2 4, 3 4)), ((4...

    See also
    --------
    sjoin : spatial join
    GeoDataFrame.overlay : equivalent method

    Notes
    -----
    Every operation in GeoPandas is planar, i.e. the potential third
    dimension is not taken into account.
    """
    # Allowed operations
    allowed_hows = [
        "intersection",
        "union",
        "identity",
        "symmetric_difference",
        "difference",  # aka erase
    ]
    # Error Messages
    if how not in allowed_hows:
        raise ValueError(
            "`how` was '{0}' but is expected to be in {1}".format(how, allowed_hows)
        )

    if isinstance(df1, GeoSeries) or isinstance(df2, GeoSeries):
        raise NotImplementedError(
            "overlay currently only implemented for GeoDataFrames"
        )

    if not _check_crs(df1, df2):
        _crs_mismatch_warn(df1, df2, stacklevel=3)

    # None means "behave like True, but warn if something gets dropped"
    if keep_geom_type is None:
        keep_geom_type = True
        keep_geom_type_warning = True
    else:
        keep_geom_type_warning = False

    polys = ["Polygon", "MultiPolygon"]
    lines = ["LineString", "MultiLineString", "LinearRing"]
    points = ["Point", "MultiPoint"]
    for i, df in enumerate([df1, df2]):
        poly_check = df.geom_type.isin(polys).any()
        lines_check = df.geom_type.isin(lines).any()
        points_check = df.geom_type.isin(points).any()
        if sum([poly_check, lines_check, points_check]) > 1:
            raise NotImplementedError(
                "df{} contains mixed geometry types.".format(i + 1)
            )

    if how == "intersection":
        # Shortcut: when the total bounds do not overlap there cannot be any
        # intersection, so return an empty frame with the expected columns.
        box_gdf1 = df1.total_bounds
        box_gdf2 = df2.total_bounds

        if not (
            ((box_gdf1[0] <= box_gdf2[2]) and (box_gdf2[0] <= box_gdf1[2]))
            and ((box_gdf1[1] <= box_gdf2[3]) and (box_gdf2[1] <= box_gdf1[3]))
        ):
            result = df1.iloc[:0].merge(
                df2.iloc[:0].drop(df2.geometry.name, axis=1),
                left_index=True,
                right_index=True,
                suffixes=("_1", "_2"),
            )
            return result[
                result.columns.drop(df1.geometry.name).tolist() + [df1.geometry.name]
            ]

    # Computations
    def _make_valid(df):
        # Work on a copy; only purely polygonal frames are checked/repaired
        df = df.copy()
        if df.geom_type.isin(polys).all():
            mask = ~df.geometry.is_valid
            col = df._geometry_column_name
            if make_valid:
                df.loc[mask, col] = df.loc[mask, col].make_valid()
            elif mask.any():
                raise ValueError(
                    "You have passed make_valid=False along with "
                    f"{mask.sum()} invalid input geometries. "
                    "Use make_valid=True or make sure that all geometries "
                    "are valid before using overlay."
                )
        return df

    df1 = _make_valid(df1)
    df2 = _make_valid(df2)

    with warnings.catch_warnings():  # CRS checked above, suppress array-level warning
        warnings.filterwarnings("ignore", message="CRS mismatch between the CRS")
        if how == "difference":
            result = _overlay_difference(df1, df2)
        elif how == "intersection":
            result = _overlay_intersection(df1, df2)
        elif how == "symmetric_difference":
            result = _overlay_symmetric_diff(df1, df2)
        elif how == "union":
            result = _overlay_union(df1, df2)
        elif how == "identity":
            # identity == the part of the union that stems from df1
            dfunion = _overlay_union(df1, df2)
            result = dfunion[dfunion["__idx1"].notnull()].copy()

        if how in ["intersection", "symmetric_difference", "union", "identity"]:
            result.drop(["__idx1", "__idx2"], axis=1, inplace=True)

    if keep_geom_type:
        geom_type = df1.geom_type.iloc[0]

        # Family of geometry types (that of df1's first row) to retain
        for family in (polys, lines, points):
            if geom_type in family:
                kept_types = family
                break
        else:
            raise TypeError("`keep_geom_type` does not support {}.".format(geom_type))

        # First we filter the geometry types inside GeometryCollections objects
        # (e.g. GeometryCollection([polygon, point]) -> polygon)
        # we do this separately on only the relevant rows, as this is an expensive
        # operation (an expensive no-op for geometry types other than collections)
        is_collection = result.geom_type == "GeometryCollection"
        if is_collection.any():
            geom_col = result._geometry_column_name
            collections = result[[geom_col]][is_collection]

            exploded = collections.reset_index(drop=True).explode(index_parts=True)
            exploded = exploded.reset_index(level=0)

            exploded.loc[~exploded.geom_type.isin(kept_types), geom_col] = None
            # The parts blanked out above are exactly the dropped ones, so
            # count the missing entries (not the remaining ones).
            num_dropped_collection = exploded.geometry.isna().sum()

            # level_0 created with above reset_index operation
            # and represents the original geometry collections
            # TODO avoiding dissolve to call union_all in this case could further
            # improve performance (we only need to collect geometries in their
            # respective Multi version)
            dissolved = exploded.dissolve(by="level_0")
            result.loc[is_collection, geom_col] = dissolved[geom_col].values
        else:
            num_dropped_collection = 0

        # Now we filter all geometries (in theory we don't need to do this
        # again for the rows handled above for GeometryCollections, but filtering
        # them out is probably more expensive as simply including them when this
        # is typically about only a few rows)
        orig_num_geoms = result.shape[0]
        result = result.loc[result.geom_type.isin(kept_types)]
        num_dropped = orig_num_geoms - result.shape[0]

        if (num_dropped > 0 or num_dropped_collection > 0) and keep_geom_type_warning:
            warnings.warn(
                "`keep_geom_type=True` in overlay resulted in {} dropped "
                "geometries of different geometry types than df1 has. "
                "Set `keep_geom_type=False` to retain all "
                "geometries".format(num_dropped + num_dropped_collection),
                UserWarning,
                stacklevel=2,
            )

    result.reset_index(drop=True, inplace=True)
    return result
@@ -0,0 +1,734 @@
import warnings
from functools import partial
from typing import Optional
import numpy as np
import pandas as pd
from geopandas import GeoDataFrame
from geopandas._compat import PANDAS_GE_30
from geopandas.array import _check_crs, _crs_mismatch_warn
def sjoin(
    left_df,
    right_df,
    how="inner",
    predicate="intersects",
    lsuffix="left",
    rsuffix="right",
    distance=None,
    on_attribute=None,
    **kwargs,
):
    """Spatial join of two GeoDataFrames.

    See the User Guide page :doc:`../../user_guide/mergingdata` for details.


    Parameters
    ----------
    left_df, right_df : GeoDataFrames
    how : string, default 'inner'
        The type of join:

        * 'left': use keys from left_df; retain only left_df geometry column
        * 'right': use keys from right_df; retain only right_df geometry column
        * 'inner': use intersection of keys from both dfs; retain only
          left_df geometry column
    predicate : string, default 'intersects'
        Binary predicate. Valid values are determined by the spatial index used.
        You can check the valid values in left_df or right_df as
        ``left_df.sindex.valid_query_predicates`` or
        ``right_df.sindex.valid_query_predicates``
        Replaces deprecated ``op`` parameter.
    lsuffix : string, default 'left'
        Suffix to apply to overlapping column names (left GeoDataFrame).
    rsuffix : string, default 'right'
        Suffix to apply to overlapping column names (right GeoDataFrame).
    distance : number or array_like, optional
        Distance(s) around each input geometry within which to query the tree
        for the 'dwithin' predicate. If array_like, must be
        one-dimensional with length equal to length of left GeoDataFrame.
        Required if ``predicate='dwithin'``.
    on_attribute : string, list or tuple
        Column name(s) to join on as an additional join restriction on top
        of the spatial predicate. These must be found in both DataFrames.
        If set, observations are joined only if the predicate applies
        and values in specified columns match.

    Returns
    -------
    GeoDataFrame
        The joined frame.

    Raises
    ------
    TypeError
        If any unexpected keyword argument is passed.

    Examples
    --------
    >>> import geodatasets
    >>> chicago = geopandas.read_file(
    ...     geodatasets.get_path("geoda.chicago_health")
    ... )
    >>> groceries = geopandas.read_file(
    ...     geodatasets.get_path("geoda.groceries")
    ... ).to_crs(chicago.crs)

    >>> chicago.head()  # doctest: +SKIP
       ComAreaID  ...                                           geometry
    0         35  ...  POLYGON ((-87.60914 41.84469, -87.60915 41.844...
    1         36  ...  POLYGON ((-87.59215 41.81693, -87.59231 41.816...
    2         37  ...  POLYGON ((-87.62880 41.80189, -87.62879 41.801...
    3         38  ...  POLYGON ((-87.60671 41.81681, -87.60670 41.816...
    4         39  ...  POLYGON ((-87.59215 41.81693, -87.59215 41.816...
    [5 rows x 87 columns]

    >>> groceries.head()  # doctest: +SKIP
       OBJECTID     Ycoord  ...  Category                         geometry
    0        16  41.973266  ...       NaN  MULTIPOINT (-87.65661 41.97321)
    1        18  41.696367  ...       NaN  MULTIPOINT (-87.68136 41.69713)
    2        22  41.868634  ...       NaN  MULTIPOINT (-87.63918 41.86847)
    3        23  41.877590  ...       new  MULTIPOINT (-87.65495 41.87783)
    4        27  41.737696  ...       NaN  MULTIPOINT (-87.62715 41.73623)
    [5 rows x 8 columns]

    >>> groceries_w_communities = geopandas.sjoin(groceries, chicago)
    >>> groceries_w_communities.head()  # doctest: +SKIP
       OBJECTID       community                           geometry
    0        16          UPTOWN  MULTIPOINT ((-87.65661 41.97321))
    1        18     MORGAN PARK  MULTIPOINT ((-87.68136 41.69713))
    2        22  NEAR WEST SIDE  MULTIPOINT ((-87.63918 41.86847))
    3        23  NEAR WEST SIDE  MULTIPOINT ((-87.65495 41.87783))
    4        27         CHATHAM  MULTIPOINT ((-87.62715 41.73623))
    [5 rows x 95 columns]

    See also
    --------
    overlay : overlay operation resulting in a new geometry
    GeoDataFrame.sjoin : equivalent method

    Notes
    -----
    Every operation in GeoPandas is planar, i.e. the potential third
    dimension is not taken into account.
    """
    # **kwargs is accepted only so that unknown keywords can be rejected
    # with the same message a regular signature would produce.
    if kwargs:
        first = next(iter(kwargs))
        raise TypeError(f"sjoin() got an unexpected keyword argument '{first}'")

    on_attribute = _maybe_make_list(on_attribute)

    # Validate the inputs (raises/warns; returns nothing)
    _basic_checks(left_df, right_df, how, lsuffix, rsuffix, on_attribute=on_attribute)

    # Positional pairs of rows satisfying the predicate
    indices = _geom_predicate_query(
        left_df, right_df, predicate, distance, on_attribute=on_attribute
    )

    # No distances are involved in a predicate join, hence None
    joined, _ = _frame_join(
        left_df,
        right_df,
        indices,
        None,
        how,
        lsuffix,
        rsuffix,
        predicate,
        on_attribute=on_attribute,
    )

    return joined
def _maybe_make_list(obj):
    """
    Normalise ``obj`` to a list.

    ``None`` and lists are returned unchanged, a tuple is converted to a
    list and any other object is wrapped in a single-element list.
    """
    if obj is None or isinstance(obj, list):
        return obj
    return list(obj) if isinstance(obj, tuple) else [obj]
def _basic_checks(left_df, right_df, how, lsuffix, rsuffix, on_attribute=None):
    """Check the validity of join input parameters.

    Both frames must be GeoDataFrames and `how` must be one of the valid
    options. A CRS mismatch between the frames results in a warning. Each
    name in `on_attribute` must be a column of both frames and must not be
    the active geometry column of either.

    Parameters
    ------------
    left_df : GeoDataFrame
    right_df : GeoDataFrame
    how : str, one of 'left', 'right', 'inner'
        join type
    lsuffix : str
        left index suffix (accepted but not inspected here)
    rsuffix : str
        right index suffix (accepted but not inspected here)
    on_attribute : list, default None
        list of column names to merge on along with geometry

    Raises
    ------
    ValueError
        If any of the checks described above fails.
    """
    if not isinstance(left_df, GeoDataFrame):
        raise ValueError(
            "'left_df' should be GeoDataFrame, got {}".format(type(left_df))
        )

    if not isinstance(right_df, GeoDataFrame):
        raise ValueError(
            "'right_df' should be GeoDataFrame, got {}".format(type(right_df))
        )

    allowed_hows = ["left", "right", "inner"]
    if how not in allowed_hows:
        raise ValueError(
            '`how` was "{}" but is expected to be in {}'.format(how, allowed_hows)
        )

    crs_matches = _check_crs(left_df, right_df)
    if not crs_matches:
        _crs_mismatch_warn(left_df, right_df, stacklevel=4)

    for attr in on_attribute or ():
        missing_left = attr not in left_df
        missing_right = attr not in right_df
        if missing_left and missing_right:
            raise ValueError(
                f"Expected column {attr} is missing from both of the dataframes."
            )
        if missing_left:
            raise ValueError(
                f"Expected column {attr} is missing from the left dataframe."
            )
        if missing_right:
            raise ValueError(
                f"Expected column {attr} is missing from the right dataframe."
            )
        geometry_names = (left_df.geometry.name, right_df.geometry.name)
        if attr in geometry_names:
            raise ValueError(
                "Active geometry column cannot be used as an input "
                "for on_attribute parameter."
            )
def _geom_predicate_query(left_df, right_df, predicate, distance, on_attribute=None):
    """Compute geometric comparisons and get matching indices.

    Parameters
    ----------
    left_df : GeoDataFrame
    right_df : GeoDataFrame
    predicate : string
        Binary predicate to query.
    distance : number or array_like or None
        Forwarded unchanged as the ``distance`` argument of the spatial
        index query.
    on_attribute: list, default None
        list of column names to merge on along with geometry

    Returns
    -------
    tuple of ndarray
        ``(l_idx, r_idx)``: integer positions of the matching rows in
        ``left_df`` and ``right_df`` respectively.
    """
    # within is implemented as the inverse of contains
    # contains is a faster predicate
    # see discussion at https://github.com/geopandas/geopandas/pull/1421
    inverted = predicate == "within"
    if inverted:
        tree, geoms, query_predicate = left_df.sindex, right_df.geometry, "contains"
    else:
        # every other predicate queries the right tree with the left geometries
        tree, geoms, query_predicate = right_df.sindex, left_df.geometry, predicate

    if tree:
        l_idx, r_idx = tree.query(
            geoms, predicate=query_predicate, sort=False, distance=distance
        )
    else:
        # when sindex is empty / has no valid geometries
        l_idx = np.array([], dtype=np.intp)
        r_idx = np.array([], dtype=np.intp)

    if inverted:
        # The roles were swapped for the query: flip the results back and
        # re-sort them by left position, then right position.
        r_idx, l_idx = l_idx, r_idx
        order = np.lexsort((r_idx, l_idx))
        l_idx, r_idx = l_idx[order], r_idx[order]

    # Keep only the pairs that also agree on every requested attribute
    for attr in on_attribute or ():
        (l_idx, r_idx), _ = _filter_shared_attribute(
            left_df, right_df, l_idx, r_idx, attr
        )

    return l_idx, r_idx
def _reset_index_with_suffix(df, suffix, other):
    """
    Equivalent of df.reset_index(), but with adding 'suffix' to auto-generated
    column names.

    Returns the reset frame together with the new column labels as a
    ``pd.Index``; the labels are NOT applied to the frame here. Raises a
    ValueError when a generated label already exists as a column of ``df``
    or of ``other``.
    """
    # Must be captured before the reset below (which may be in place)
    original_names = df.index.names

    if PANDAS_GE_30:
        df_reset = df.reset_index()
    else:
        # we already made a copy of the dataframe in _frame_join before getting here
        df_reset = df
        df_reset.reset_index(inplace=True)

    labels = df_reset.columns.to_numpy(copy=True)
    for position, original_name in enumerate(original_names):
        if original_name is not None:
            # named index levels keep their own name
            continue

        # the level had no name: add the suffix to the auto-generated one
        auto_name = labels[position]
        if "level" in auto_name:
            # reset_index of MultiIndex gives "level_i" names, preserve the "i"
            level_number = auto_name.split("_")[1]
            new_label = f"index_{suffix}{level_number}"
        else:
            new_label = f"index_{suffix}"

        # check new label will not be in other dataframe
        if new_label in df.columns or new_label in other.columns:
            raise ValueError(
                "'{0}' cannot be a column name in the frames being"
                " joined".format(new_label)
            )
        labels[position] = new_label

    return df_reset, pd.Index(labels)
def _process_column_names_with_suffix(
    left: pd.Index, right: pd.Index, suffixes, left_df, right_df
):
    """
    Add suffixes to overlapping labels (ignoring the geometry column).

    Labels present on both sides get ``_<suffix>`` appended, except for the
    label equal to the respective frame's ``_geometry_column_name`` and
    except on a side whose suffix is ``None``. Without any overlap the
    inputs are returned as they are.

    This is based on pandas' merge logic at https://github.com/pandas-dev/pandas/blob/
    a0779adb183345a8eb4be58b3ad00c223da58768/pandas/core/reshape/merge.py#L2300-L2370
    """
    overlapping = left.intersection(right)
    if len(overlapping) == 0:
        return left, right

    lsuffix, rsuffix = suffixes
    if not lsuffix and not rsuffix:
        raise ValueError(f"columns overlap but no suffix specified: {overlapping}")

    def with_suffix(labels, suffix, frame):
        # name of the geometry column of this side, which is never renamed
        geometry = getattr(frame, "_geometry_column_name", None)
        # TODO retain index name?
        return pd.Index(
            [
                f"{lab}_{suffix}"
                if lab in overlapping and lab != geometry and suffix is not None
                else lab
                for lab in labels
            ]
        )

    left_renamed = with_suffix(left, lsuffix, left_df)
    right_renamed = with_suffix(right, rsuffix, right_df)

    # Only warn when duplicates are caused because of suffixes, already duplicated
    # columns in origin should not warn
    dups = []
    for renamed, original in ((left_renamed, left), (right_renamed, right)):
        if not renamed.is_unique:
            newly_duplicated = renamed.duplicated() & ~original.duplicated()
            dups.extend(renamed[newly_duplicated].tolist())

    # TODO turn this into an error (pandas has done so as well)
    if dups:
        warnings.warn(
            f"Passing 'suffixes' which cause duplicate columns {set(dups)} in the "
            f"result is deprecated and will raise a MergeError in a future version.",
            FutureWarning,
            stacklevel=4,
        )

    return left_renamed, right_renamed
def _restore_index(joined, index_names, index_names_original):
    """
    Set the original index columns back as index and restore their name to
    `None` where the level did not have a name originally.

    ``index_names`` are the columns currently holding the index values;
    ``index_names_original`` are the names the levels had before the reset.
    """
    index_columns = list(index_names)
    if PANDAS_GE_30:
        joined = joined.set_index(index_columns)
    else:
        joined.set_index(index_columns, inplace=True)

    # restore the fact that the index didn't have a name
    restored_names = list(joined.index.names)
    unnamed_positions = [
        position
        for position, original in enumerate(index_names_original)
        if original is None
    ]
    for position in unnamed_positions:
        restored_names[position] = None
    joined.index.names = restored_names
    return joined
def _adjust_indexers(indices, distances, original_length, how, predicate):
    """
    Extend the inner-join indexers so they describe a left or right join.

    The left/right indexers from the query represent an inner join. For a
    left or right join, the rows of the "keeping" side that have no match
    are inserted at their sorted position, paired with -1 on the other side
    (and with NaN in ``distances``, when given).

    ``original_length`` is the number of rows of the keeping side.
    ``predicate`` is accepted but not used.

    Returns ``((l_idx, r_idx), distances)``.
    """
    # the indices represent an inner join, no adjustment needed
    if how == "inner":
        return indices, distances

    keep_right = how == "right"
    l_idx, r_idx = indices

    if keep_right:
        # re-sort so it is sorted by the right indexer
        order = np.lexsort((l_idx, r_idx))
        keeping, other = r_idx[order], l_idx[order]
        if distances is not None:
            distances = distances[order]
    else:
        keeping, other = l_idx, r_idx

    # determine which indices are missing and where they would need to be inserted
    all_positions = np.arange(original_length)
    unmatched = all_positions[~np.isin(all_positions, keeping)]
    insert_at = np.searchsorted(keeping, unmatched)

    # the keeping side receives the unmatched positions themselves ...
    keeping = np.insert(keeping, insert_at, unmatched)
    # ... the other side -1 -> to get missing values in pandas' reindexing
    other = np.insert(other, insert_at, -1)
    # ... and the distances are filled with missing values right away
    if distances is not None:
        distances = np.insert(distances, insert_at, np.nan)

    if keep_right:
        return (other, keeping), distances
    return (keeping, other), distances
def _frame_join(
    left_df,
    right_df,
    indices,
    distances,
    how,
    lsuffix,
    rsuffix,
    predicate,
    on_attribute=None,
):
    """Join the GeoDataFrames at the DataFrame level.

    Parameters
    ----------
    left_df : GeoDataFrame
    right_df : GeoDataFrame
    indices : tuple of ndarray
        Indices returned by the geometric join. Tuple with integer
        indices representing the matches from `left_df` and `right_df`
        respectively.
    distances : ndarray, optional
        Passed through and adapted based on the indices, if needed.
    how : string
        The type of join to use on the DataFrame level.
    lsuffix : string
        Suffix to apply to overlapping column names (left GeoDataFrame).
    rsuffix : string
        Suffix to apply to overlapping column names (right GeoDataFrame).
    predicate : string
        Only forwarded to ``_adjust_indexers``.
    on_attribute: list, default None
        list of column names to merge on along with geometry

    Returns
    -------
    tuple
        ``(joined, distances)``: the joined GeoDataFrame and the distances
        as returned by ``_adjust_indexers``.
    """
    if on_attribute:  # avoid renaming or duplicating shared column
        right_df = right_df.drop(on_attribute, axis=1)

    # Only one geometry column survives the join: the left one for
    # inner/left joins, the right one for right joins.
    if how in ("inner", "left"):
        right_df = right_df.drop(right_df.geometry.name, axis=1)
    else:  # how == 'right':
        left_df = left_df.drop(left_df.geometry.name, axis=1)

    # Shallow copies: on older pandas the reset below works in place and
    # must not touch the caller's frames.
    left_df = left_df.copy(deep=False)
    left_nlevels = left_df.index.nlevels
    left_index_original = left_df.index.names
    left_df, left_column_names = _reset_index_with_suffix(left_df, lsuffix, right_df)

    right_df = right_df.copy(deep=False)
    right_nlevels = right_df.index.nlevels
    right_index_original = right_df.index.names
    right_df, right_column_names = _reset_index_with_suffix(right_df, rsuffix, left_df)

    # if conflicting names in left and right, add suffix
    left_column_names, right_column_names = _process_column_names_with_suffix(
        left_column_names,
        right_column_names,
        (lsuffix, rsuffix),
        left_df,
        right_df,
    )
    left_df.columns = left_column_names
    right_df.columns = right_column_names
    # after the reset, the former index levels are the leading columns
    left_index = left_df.columns[:left_nlevels]
    right_index = right_df.columns[:right_nlevels]

    # perform join on the dataframes
    original_length = len(right_df) if how == "right" else len(left_df)
    (l_idx, r_idx), distances = _adjust_indexers(
        indices, distances, original_length, how, predicate
    )
    # the `take` method doesn't allow introducing NaNs with -1 indices
    # left = left_df.take(l_idx)
    # therefore we are using the private _reindex_with_indexers as workaround
    new_index = pd.RangeIndex(len(l_idx))
    left = left_df._reindex_with_indexers({0: (new_index, l_idx)})
    right = right_df._reindex_with_indexers({0: (new_index, r_idx)})
    # the ``copy`` keyword is only passed to concat on pandas < 3.0
    if PANDAS_GE_30:
        kwargs = {}
    else:
        kwargs = dict(copy=False)
    joined = pd.concat([left, right], axis=1, **kwargs)

    # put the index of the side whose keys are kept back in place
    if how in ("inner", "left"):
        joined = _restore_index(joined, left_index, left_index_original)
    else:  # how == 'right':
        joined = joined.set_geometry(right_df.geometry.name)
        joined = _restore_index(joined, right_index, right_index_original)

    return joined, distances
def _nearest_query(
    left_df: GeoDataFrame,
    right_df: GeoDataFrame,
    max_distance: float,
    how: str,
    return_distance: bool,
    exclusive: bool,
    on_attribute: Optional[list] = None,
):
    """
    Find the nearest geometries between two frames via the spatial index.

    Parameters
    ----------
    left_df, right_df : GeoDataFrame
    max_distance : float
        Forwarded to ``sindex.nearest``.
    how : str
        Join type; for ``"right"`` the tree of ``left_df`` is queried with
        the geometries of ``right_df``, otherwise the other way around.
    return_distance : bool
        Whether distances are requested from the index and returned.
    exclusive : bool
        Forwarded to ``sindex.nearest``.
    on_attribute : list, default None
        Column names whose values must match for a pair to be kept.

    Returns
    -------
    tuple
        ``((l_idx, r_idx), distances)`` with the positions of the matched
        rows in ``left_df`` / ``right_df``. ``distances`` is ``None`` when
        ``return_distance`` is False.
    """
    # use the opposite of the join direction for the index
    use_left_as_sindex = how == "right"
    if use_left_as_sindex:
        sindex = left_df.sindex
        query = right_df.geometry
    else:
        sindex = right_df.sindex
        query = left_df.geometry

    if sindex:
        res = sindex.nearest(
            query,
            return_all=True,
            max_distance=max_distance,
            return_distance=return_distance,
            exclusive=exclusive,
        )
        if return_distance:
            (input_idx, tree_idx), distances = res
        else:
            (input_idx, tree_idx) = res
            distances = None

        if use_left_as_sindex:
            # the tree side is the left frame here; order the pairs by it
            l_idx, r_idx = tree_idx, input_idx
            sort_order = np.argsort(l_idx, kind="stable")
            l_idx, r_idx = l_idx[sort_order], r_idx[sort_order]
            if distances is not None:
                distances = distances[sort_order]
        else:
            l_idx, r_idx = input_idx, tree_idx
    else:
        # when sindex is empty / has no valid geometries
        l_idx, r_idx = np.array([], dtype=np.intp), np.array([], dtype=np.intp)
        distances = np.array([], dtype=np.float64) if return_distance else None

    if on_attribute:
        for attr in on_attribute:
            (l_idx, r_idx), shared_attribute_rows = _filter_shared_attribute(
                left_df, right_df, l_idx, r_idx, attr
            )
            # distances is None when return_distance is False; indexing it
            # unconditionally would raise a TypeError
            if distances is not None:
                distances = distances[shared_attribute_rows]

    return (l_idx, r_idx), distances
def _filter_shared_attribute(left_df, right_df, l_idx, r_idx, attribute):
    """
    Keep only the index pairs whose rows agree on the ``attribute`` column.

    Returns the filtered ``(l_idx, r_idx)`` pair together with the Boolean
    mask ``shared_attribute_rows`` that marks which of the incoming pairs had
    equal entries in both dataframes.
    """
    left_values = left_df[attribute].iloc[l_idx].values
    right_values = right_df[attribute].iloc[r_idx].values
    shared_attribute_rows = left_values == right_values
    kept_pairs = (l_idx[shared_attribute_rows], r_idx[shared_attribute_rows])
    return kept_pairs, shared_attribute_rows
def sjoin_nearest(
    left_df: GeoDataFrame,
    right_df: GeoDataFrame,
    how: str = "inner",
    max_distance: Optional[float] = None,
    lsuffix: str = "left",
    rsuffix: str = "right",
    distance_col: Optional[str] = None,
    exclusive: bool = False,
) -> GeoDataFrame:
    """Spatial join of two GeoDataFrames based on the distance between their geometries.

    Results will include multiple output records for a single input record
    where there are multiple equidistant nearest or intersected neighbors.

    Distance is calculated in CRS units and can be returned using the
    `distance_col` parameter.

    See the User Guide page
    https://geopandas.readthedocs.io/en/latest/docs/user_guide/mergingdata.html
    for more details.

    Parameters
    ----------
    left_df, right_df : GeoDataFrames
    how : string, default 'inner'
        The type of join:

        * 'left': use keys from left_df; retain only left_df geometry column
        * 'right': use keys from right_df; retain only right_df geometry column
        * 'inner': use intersection of keys from both dfs; retain only
          left_df geometry column

    max_distance : float, default None
        Maximum distance within which to query for nearest geometry.
        Must be greater than 0.
        The max_distance used to search for nearest items in the tree may have a
        significant impact on performance by reducing the number of input
        geometries that are evaluated for nearest items in the tree.
    lsuffix : string, default 'left'
        Suffix to apply to overlapping column names (left GeoDataFrame).
    rsuffix : string, default 'right'
        Suffix to apply to overlapping column names (right GeoDataFrame).
    distance_col : string, default None
        If set, save the distances computed between matching geometries under a
        column of this name in the joined GeoDataFrame.
    exclusive : bool, default False
        If True, the nearest geometries that are equal to the input geometry
        will not be returned, default False.

    Returns
    -------
    GeoDataFrame
        The joined frame. When `distance_col` is set, it additionally holds
        the distances under that column name.

    Examples
    --------
    >>> import geodatasets
    >>> groceries = geopandas.read_file(
    ...     geodatasets.get_path("geoda.groceries")
    ... )
    >>> chicago = geopandas.read_file(
    ...     geodatasets.get_path("geoda.chicago_health")
    ... ).to_crs(groceries.crs)

    >>> chicago.head()  # doctest: +SKIP
    ComAreaID ... geometry
    0 35 ... POLYGON ((-87.60914 41.84469, -87.60915 41.844...
    1 36 ... POLYGON ((-87.59215 41.81693, -87.59231 41.816...
    2 37 ... POLYGON ((-87.62880 41.80189, -87.62879 41.801...
    3 38 ... POLYGON ((-87.60671 41.81681, -87.60670 41.816...
    4 39 ... POLYGON ((-87.59215 41.81693, -87.59215 41.816...
    [5 rows x 87 columns]

    >>> groceries.head()  # doctest: +SKIP
    OBJECTID Ycoord ... Category geometry
    0 16 41.973266 ... NaN MULTIPOINT ((-87.65661 41.97321))
    1 18 41.696367 ... NaN MULTIPOINT ((-87.68136 41.69713))
    2 22 41.868634 ... NaN MULTIPOINT ((-87.63918 41.86847))
    3 23 41.877590 ... new MULTIPOINT ((-87.65495 41.87783))
    4 27 41.737696 ... NaN MULTIPOINT ((-87.62715 41.73623))
    [5 rows x 8 columns]

    >>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago)
    >>> groceries_w_communities[["Chain", "community", "geometry"]].head(2)
    Chain community geometry
    0 VIET HOA PLAZA UPTOWN MULTIPOINT ((1168268.672 1933554.35))
    1 COUNTY FAIR FOODS MORGAN PARK MULTIPOINT ((1162302.618 1832900.224))

    To include the distances:

    >>> groceries_w_communities = geopandas.sjoin_nearest(groceries, chicago, \
distance_col="distances")
    >>> groceries_w_communities[["Chain", "community", \
"distances"]].head(2)
    Chain community distances
    0 VIET HOA PLAZA UPTOWN 0.0
    1 COUNTY FAIR FOODS MORGAN PARK 0.0

    In the following example, we get multiple groceries for Uptown because all
    results are equidistant (in this case zero because they intersect).
    In fact, we get 4 results in total:

    >>> chicago_w_groceries = geopandas.sjoin_nearest(groceries, chicago, \
distance_col="distances", how="right")
    >>> uptown_results = \
chicago_w_groceries[chicago_w_groceries["community"] == "UPTOWN"]
    >>> uptown_results[["Chain", "community"]]
    Chain community
    30 VIET HOA PLAZA UPTOWN
    30 JEWEL OSCO UPTOWN
    30 TARGET UPTOWN
    30 Mariano's UPTOWN

    See also
    --------
    sjoin : binary predicate joins
    GeoDataFrame.sjoin_nearest : equivalent method

    Notes
    -----
    Since this join relies on distances, results will be inaccurate
    if your geometries are in a geographic CRS.

    Every operation in GeoPandas is planar, i.e. the potential third
    dimension is not taken into account.
    """
    _basic_checks(left_df, right_df, how, lsuffix, rsuffix)

    # Kept as two separate statements on purpose, one check per input frame.
    left_df.geometry.values.check_geographic_crs(stacklevel=1)
    right_df.geometry.values.check_geographic_crs(stacklevel=1)

    # distances are only computed when the caller asked for a distance column
    want_distances = distance_col is not None

    nearest_idx, nearest_dist = _nearest_query(
        left_df, right_df, max_distance, how, want_distances, exclusive
    )
    result, nearest_dist = _frame_join(
        left_df, right_df, nearest_idx, nearest_dist, how, lsuffix, rsuffix, None
    )

    if not want_distances:
        return result
    result[distance_col] = nearest_dist
    return result
@@ -0,0 +1,484 @@
"""Tests for the clip module."""
import numpy as np
import pandas as pd
import shapely
from shapely.geometry import (
GeometryCollection,
LinearRing,
LineString,
MultiPoint,
Point,
Polygon,
box,
)
import geopandas
from geopandas import GeoDataFrame, GeoSeries, clip
from geopandas._compat import HAS_PYPROJ
from geopandas.tools.clip import _mask_is_list_like_rectangle
import pytest
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
from pandas.testing import assert_index_equal
# Names of the fixtures that provide the single rectangle mask in its
# different forms (GeoDataFrame, or its bounds as list / tuple / array).
# Used to parametrize tests, which resolve the names via
# ``request.getfixturevalue``.
mask_variants_single_rectangle = [
    "single_rectangle_gdf",
    "single_rectangle_gdf_list_bounds",
    "single_rectangle_gdf_tuple_bounds",
    "single_rectangle_gdf_array_bounds",
]
# Names of the fixtures that provide the larger rectangle mask, as a
# GeoDataFrame or as its bounds.
mask_variants_large_rectangle = [
    "larger_single_rectangle_gdf",
    "larger_single_rectangle_gdf_bounds",
]
@pytest.fixture
def point_gdf():
    """Return a GeoDataFrame holding four points in EPSG:3857."""
    coords = np.array([[2, 2], [3, 4], [9, 8], [-12, -15]])
    points = [Point(xy) for xy in coords]
    return GeoDataFrame(points, columns=["geometry"], crs="EPSG:3857")
@pytest.fixture
def point_gdf2():
    """Return a GeoDataFrame holding six points on the diagonal, unordered."""
    coords = np.array([[5, 5], [2, 2], [4, 4], [0, 0], [3, 3], [1, 1]])
    points = [Point(xy) for xy in coords]
    return GeoDataFrame(points, columns=["geometry"], crs="EPSG:3857")
@pytest.fixture
def pointsoutside_nooverlap_gdf():
    """Return a point GeoDataFrame lying entirely outside the single rectangle.

    Neither the points nor their bounding box overlap the single rectangle.
    """
    coords = np.array([[5, 15], [15, 15], [15, 20]])
    points = [Point(xy) for xy in coords]
    return GeoDataFrame(points, columns=["geometry"], crs="EPSG:3857")
@pytest.fixture
def pointsoutside_overlap_gdf():
    """Return a point GeoDataFrame whose points miss the single rectangle.

    All points are outside the single rectangle, but their bounding box
    overlaps the rectangle's bounds.
    """
    coords = np.array([[5, 15], [15, 15], [15, 5]])
    points = [Point(xy) for xy in coords]
    return GeoDataFrame(points, columns=["geometry"], crs="EPSG:3857")
@pytest.fixture
def single_rectangle_gdf():
    """Return a one-row GeoDataFrame with a 10x10 square used as clip mask."""
    corners = [(0, 0), (0, 10), (10, 10), (10, 0), (0, 0)]
    rectangle = GeoDataFrame([1], geometry=[Polygon(corners)], crs="EPSG:3857")
    rectangle["attr2"] = "site-boundary"
    return rectangle
@pytest.fixture
def single_rectangle_gdf_tuple_bounds(single_rectangle_gdf):
    """Total bounds of the single rectangle, as a tuple."""
    bounds = single_rectangle_gdf.total_bounds
    return tuple(bounds)
@pytest.fixture
def single_rectangle_gdf_list_bounds(single_rectangle_gdf):
    """Total bounds of the single rectangle, as a list."""
    bounds = single_rectangle_gdf.total_bounds
    return list(bounds)
@pytest.fixture
def single_rectangle_gdf_array_bounds(single_rectangle_gdf):
    """Total bounds of the single rectangle, exactly as ``total_bounds`` gives them."""
    bounds = single_rectangle_gdf.total_bounds
    return bounds
@pytest.fixture
def larger_single_rectangle_gdf():
    """Return a slightly larger rectangle for clipping.

    The smaller single rectangle is used to test the edge case where slivers
    are returned when polygons are clipped. This larger fixture eliminates
    the slivers in the clip return.
    """
    corners = [(-5, -5), (-5, 15), (15, 15), (15, -5), (-5, -5)]
    rectangle = GeoDataFrame([1], geometry=[Polygon(corners)], crs="EPSG:3857")
    rectangle["attr2"] = ["study area"]
    return rectangle
@pytest.fixture
def larger_single_rectangle_gdf_bounds(larger_single_rectangle_gdf):
    """Total bounds of the larger rectangle, as a tuple."""
    bounds = larger_single_rectangle_gdf.total_bounds
    return tuple(bounds)
@pytest.fixture
def buffered_locations(point_gdf):
    """Buffer the points of ``point_gdf`` by 4 and tag every row as a plot."""
    # NOTE: this writes into (and returns) the ``point_gdf`` object itself,
    # no copy is made.
    point_gdf["geometry"] = point_gdf.buffer(4)
    point_gdf["type"] = "plot"
    return point_gdf
@pytest.fixture
def donut_geometry(buffered_locations, single_rectangle_gdf):
    """Make a geometry with a hole in the middle (a donut).

    Built as the symmetric difference of the buffered locations and the
    single rectangle.
    """
    return geopandas.overlay(
        buffered_locations, single_rectangle_gdf, how="symmetric_difference"
    )
@pytest.fixture
def two_line_gdf():
    """Return a GeoDataFrame with two LineStrings for testing."""
    first_line = LineString([(1, 1), (2, 2), (3, 2), (5, 3)])
    second_line = LineString([(3, 4), (5, 7), (12, 2), (10, 5), (9, 7.5)])
    return GeoDataFrame(
        [1, 2], geometry=[first_line, second_line], crs="EPSG:3857"
    )
@pytest.fixture
def multi_poly_gdf(donut_geometry):
    """Return a one-row GeoDataFrame holding the union of the donut geometries."""
    merged = donut_geometry.union_all()
    frame = GeoDataFrame(geometry=GeoSeries(merged), crs="EPSG:3857")
    frame["attr"] = ["pool"]
    return frame
@pytest.fixture
def multi_line(two_line_gdf):
    """Return a two-row line GeoDataFrame.

    The first row is the union of the lines in ``two_line_gdf``, the second
    row is a regular LineString.
    """
    merged_lines = two_line_gdf.union_all()
    single_line = LineString([(2, 1), (3, 1), (4, 1), (5, 2)])
    geoms = GeoSeries([merged_lines, single_line])
    frame = GeoDataFrame(geometry=geoms, crs="EPSG:3857")
    frame["attr"] = ["road", "stream"]
    return frame
@pytest.fixture
def multi_point(point_gdf):
    """Return a GeoDataFrame with the union of ``point_gdf`` plus three points."""
    merged_points = point_gdf.union_all()
    geoms = GeoSeries(
        [merged_points, Point(2, 5), Point(-11, -14), Point(-10, -12)]
    )
    frame = GeoDataFrame(geometry=geoms, crs="EPSG:3857")
    frame["attr"] = ["tree", "another tree", "shrub", "berries"]
    return frame
@pytest.fixture
def mixed_gdf():
    """Return a GeoDataFrame mixing a Point, Polygon, LineString and LinearRing."""
    path = [(1, 1), (2, 2), (3, 2), (5, 3), (12, 1)]
    geoms = [
        Point(2, 3),
        Polygon([(3, 4), (5, 2), (12, 2), (10, 5), (9, 7.5)]),
        LineString(path),
        LinearRing(path),
    ]
    return GeoDataFrame([1, 2, 3, 4], geometry=geoms, crs="EPSG:3857")
@pytest.fixture
def geomcol_gdf():
    """Return a one-row GeoDataFrame holding a GeometryCollection.

    The collection consists of a Point and a Polygon.
    """
    members = [
        Point(2, 3),
        Polygon([(3, 4), (5, 2), (12, 2), (10, 5), (9, 7.5)]),
    ]
    collection = GeometryCollection(members)
    return GeoDataFrame([1], geometry=[collection], crs="EPSG:3857")
@pytest.fixture
def sliver_line():
    """Return two lines, the first of which yields a point when clipped."""
    touching_line = LineString([(10, 5), (13, 5), (15, 5)])
    crossing_line = LineString([(1, 1), (2, 2), (3, 2), (5, 3), (12, 1)])
    return GeoDataFrame(
        [1, 2], geometry=[touching_line, crossing_line], crs="EPSG:3857"
    )
def test_not_gdf(single_rectangle_gdf):
    """Unsupported inputs or masks make ``clip`` raise a TypeError."""
    invalid_calls = [
        ((2, 3), single_rectangle_gdf),
        (single_rectangle_gdf, "foobar"),
        (single_rectangle_gdf, (1, 2, 3)),
        (single_rectangle_gdf, (1, 2, 3, 4, 5)),
    ]
    for to_clip, mask in invalid_calls:
        with pytest.raises(TypeError):
            clip(to_clip, mask)
def test_non_overlapping_geoms():
    """Clipping with a mask whose extent does not overlap returns an empty result."""

    def shift_right(geom):
        return shapely.affinity.translate(geom, xoff=20)

    unit_box = Polygon([(0, 0), (0, 1), (1, 1), (1, 0), (0, 0)])
    unit_gdf = GeoDataFrame([1], geometry=[unit_box], crs="EPSG:3857")
    # GeoSeries with the unit box moved 20 units along x
    shifted_mask = unit_gdf.copy().geometry.apply(shift_right)

    clipped_frame = clip(unit_gdf, shifted_mask)
    assert_geodataframe_equal(clipped_frame, unit_gdf.iloc[:0])

    clipped_series = clip(unit_gdf.geometry, shifted_mask)
    assert_geoseries_equal(clipped_series, GeoSeries(crs=unit_gdf.crs))
@pytest.mark.parametrize("mask_fixture_name", mask_variants_single_rectangle)
class TestClipWithSingleRectangleGdf:
    """Clip tests that run once for every form of the single rectangle mask."""

    @pytest.fixture
    def mask(self, mask_fixture_name, request):
        """Resolve the parametrized fixture name into the actual mask."""
        return request.getfixturevalue(mask_fixture_name)

    def test_returns_gdf(self, point_gdf, mask):
        """Clipping a GeoDataFrame returns a GeoDataFrame (or GDF-like) object."""
        result = clip(point_gdf, mask)
        assert isinstance(result, GeoDataFrame)

    def test_returns_series(self, point_gdf, mask):
        """Clipping a GeoSeries returns a GeoSeries."""
        result = clip(point_gdf.geometry, mask)
        assert isinstance(result, GeoSeries)

    def test_clip_points(self, point_gdf, mask):
        """Points outside the mask are dropped from a points GDF."""
        kept_coords = np.array([[2, 2], [3, 4], [9, 8]])
        expected = GeoDataFrame(
            [Point(xy) for xy in kept_coords],
            columns=["geometry"],
            crs="EPSG:3857",
        )
        assert_geodataframe_equal(clip(point_gdf, mask), expected)

    def test_clip_points_geom_col_rename(self, point_gdf, mask):
        """Clipping points keeps a renamed geometry column."""
        renamed = point_gdf.rename_geometry("geometry2")
        kept_coords = np.array([[2, 2], [3, 4], [9, 8]])
        expected = GeoDataFrame(
            [Point(xy) for xy in kept_coords],
            columns=["geometry2"],
            crs="EPSG:3857",
            geometry="geometry2",
        )
        assert_geodataframe_equal(clip(renamed, mask), expected)

    def test_clip_poly(self, buffered_locations, mask):
        """Clipping a polygon GDF yields three polygons."""
        result = clip(buffered_locations, mask)
        assert len(result.geometry) == 3
        assert (result.geom_type == "Polygon").all()

    def test_clip_poly_geom_col_rename(self, buffered_locations, mask):
        """Clipping a polygon GDF keeps a renamed geometry column."""
        renamed = buffered_locations.rename_geometry("geometry2")
        result = clip(renamed, mask)
        assert len(result.geometry) == 3
        column_names = result.keys()
        assert "geometry" not in column_names
        assert "geometry2" in column_names

    def test_clip_poly_series(self, buffered_locations, mask):
        """Clipping a polygon GeoSeries yields three polygons."""
        result = clip(buffered_locations.geometry, mask)
        assert len(result) == 3
        assert (result.geom_type == "Polygon").all()

    def test_clip_multipoly_keep_geom_type(self, multi_poly_gdf, mask):
        """Test a multi poly object where the return includes a sliver.

        Also the bounds of the object should == the bounds of the clip object
        if they fully overlap (as they do in these fixtures)."""
        result = clip(multi_poly_gdf, mask, keep_geom_type=True)
        if _mask_is_list_like_rectangle(mask):
            expected_bounds = mask
        else:
            expected_bounds = mask.total_bounds
        assert np.array_equal(result.total_bounds, expected_bounds)
        # Only polygonal geometries may remain, no geometry collection
        polygonal = result.geom_type.isin(["Polygon", "MultiPolygon"])
        assert polygonal.all()

    def test_clip_multiline(self, multi_line, mask):
        """Clipping a multiline feature with a poly returns a MultiLineString."""
        result = clip(multi_line, mask)
        assert result.geom_type[0] == "MultiLineString"

    def test_clip_multipoint(self, multi_point, mask):
        """Clipping a multipoint feature with a polygon works as expected.

        The first row of the result should be a multi point feature."""
        result = clip(multi_point, mask)
        assert result.geom_type[0] == "MultiPoint"
        assert hasattr(result, "attr")
        # Two rows are expected to remain
        assert len(result) == 2
        expected_multipoint = MultiPoint([Point(2, 2), Point(3, 4), Point(9, 8)])
        assert result.iloc[0].geometry.wkt == expected_multipoint.wkt
        # Everything left must intersect the clip geometry
        if _mask_is_list_like_rectangle(mask):
            mask_shape = box(*mask)
        else:
            mask_shape = mask.union_all()
        assert all(result.intersects(mask_shape))

    def test_clip_lines(self, two_line_gdf, mask):
        """Clipping a line GDF keeps both lines."""
        result = clip(two_line_gdf, mask)
        assert len(result.geometry) == 2

    def test_mixed_geom(self, mixed_gdf, mask):
        """Test clipping a mixed GeoDataFrame"""
        geom_types = clip(mixed_gdf, mask).geom_type
        assert geom_types[0] == "Point"
        assert geom_types[1] == "Polygon"
        assert geom_types[2] == "LineString"

    def test_mixed_series(self, mixed_gdf, mask):
        """Test clipping a mixed GeoSeries"""
        geom_types = clip(mixed_gdf.geometry, mask).geom_type
        assert geom_types[0] == "Point"
        assert geom_types[1] == "Polygon"
        assert geom_types[2] == "LineString"

    def test_clip_with_line_extra_geom(self, sliver_line, mask):
        """When the output of a clipped line returns a geom collection,
        and keep_geom_type is True, no geometry collections should be returned."""
        result = clip(sliver_line, mask, keep_geom_type=True)
        assert len(result.geometry) == 1
        # No geometry collection may be present in the result
        assert (result.geom_type != "GeometryCollection").all()

    def test_clip_no_box_overlap(self, pointsoutside_nooverlap_gdf, mask):
        """Test clip when intersection is empty and boxes do not overlap."""
        result = clip(pointsoutside_nooverlap_gdf, mask)
        assert len(result) == 0

    def test_clip_box_overlap(self, pointsoutside_overlap_gdf, mask):
        """Test clip when intersection is empty and boxes do overlap."""
        result = clip(pointsoutside_overlap_gdf, mask)
        assert len(result) == 0

    def test_warning_extra_geoms_mixed(self, mixed_gdf, mask):
        """A UserWarning is raised if keep_geom_type is used on a mixed GDF."""
        with pytest.warns(UserWarning):
            clip(mixed_gdf, mask, keep_geom_type=True)

    def test_warning_geomcoll(self, geomcol_gdf, mask):
        """A UserWarning is raised if keep_geom_type is used on a GDF
        with a GeometryCollection."""
        with pytest.warns(UserWarning):
            clip(geomcol_gdf, mask, keep_geom_type=True)
def test_clip_line_keep_slivers(sliver_line, single_rectangle_gdf):
    """A line-only input may produce a Point when clipped without
    keep_geom_type."""
    geom_types = clip(sliver_line, single_rectangle_gdf).geom_type
    # The first line collapses to a Point, the second stays a LineString
    assert "Point" == geom_types[0]
    assert "LineString" == geom_types[1]
def test_clip_multipoly_keep_slivers(multi_poly_gdf, single_rectangle_gdf):
    """Test a multi poly object where the return includes a sliver.

    Also the bounds of the object should == the bounds of the clip object
    if they fully overlap (as they do in these fixtures)."""
    result = clip(multi_poly_gdf, single_rectangle_gdf)
    mask_bounds = single_rectangle_gdf.total_bounds
    assert np.array_equal(result.total_bounds, mask_bounds)
    # The sliver geometries turn the result into a geometry collection
    first_type = result.geom_type[0]
    assert "GeometryCollection" in first_type
@pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not available")
def test_warning_crs_mismatch(point_gdf, single_rectangle_gdf):
    """Clipping with a mask in another CRS emits a CRS mismatch warning."""
    with pytest.warns(UserWarning, match="CRS mismatch between the CRS"):
        reprojected_mask = single_rectangle_gdf.to_crs(4326)
        clip(point_gdf, reprojected_mask)
def test_clip_with_polygon(single_rectangle_gdf):
    """Test clip when the mask is a plain shapely polygon."""
    triangle = Polygon([(0, 0), (5, 12), (10, 0), (0, 0)])
    square = Polygon([(0, 0), (0, 10), (10, 10), (10, 0), (0, 0)])

    expected = GeoDataFrame(
        [1], geometry=[triangle.intersection(square)], crs="EPSG:3857"
    )
    expected["attr2"] = "site-boundary"

    result = clip(single_rectangle_gdf, triangle)
    assert_geodataframe_equal(result, expected)
def test_clip_with_multipolygon(buffered_locations, single_rectangle_gdf):
    """Test clipping a polygon with a mask dissolved from several polygons."""
    dissolved = buffered_locations.dissolve(by="type")
    mask = dissolved.reset_index()
    result = clip(single_rectangle_gdf, mask)
    assert result.geom_type[0] == "Polygon"
@pytest.mark.parametrize("mask_fixture_name", mask_variants_large_rectangle)
def test_clip_single_multipoly_no_extra_geoms(
    buffered_locations, mask_fixture_name, request
):
    """When clipping a multi-polygon feature, no additional geom types
    should be returned."""
    mask = request.getfixturevalue(mask_fixture_name)
    dissolved = buffered_locations.dissolve(by="type").reset_index()
    result = clip(dissolved, mask)
    assert result.geom_type[0] == "Polygon"
@pytest.mark.filterwarnings("ignore:All-NaN slice encountered")
@pytest.mark.parametrize(
    "mask",
    [
        Polygon(),
        (np.nan,) * 4,
        (np.nan, 0, np.nan, 1),
        GeoSeries([Polygon(), Polygon()], crs="EPSG:3857"),
        GeoSeries([Polygon(), Polygon()], crs="EPSG:3857").to_frame(),
        GeoSeries([], crs="EPSG:3857"),
        GeoSeries([], crs="EPSG:3857").to_frame(),
    ],
)
def test_clip_empty_mask(buffered_locations, mask):
    """Test that clipping with empty mask returns an empty result."""
    # GeoDataFrame input
    expected_frame = GeoDataFrame(
        [], columns=["geometry", "type"], crs="EPSG:3857"
    )
    frame_result = clip(buffered_locations, mask)
    assert_geodataframe_equal(
        frame_result, expected_frame, check_index_type=False
    )

    # GeoSeries input
    expected_series = GeoSeries([], crs="EPSG:3857")
    series_result = clip(buffered_locations.geometry, mask)
    assert_geoseries_equal(series_result, expected_series)
def test_clip_sorting(point_gdf2):
    """Test the ``sort`` keyword of clip."""
    bbox = shapely.geometry.box(0, 0, 2, 2)

    unsorted_index = point_gdf2.clip(bbox).index
    sorted_index = point_gdf2.clip(bbox, sort=True).index

    # without sort=True the result index is not in ascending order
    assert not (sorted(unsorted_index) == unsorted_index).all()
    # with sort=True it is
    assert (sorted(sorted_index) == sorted_index).all()
    assert_index_equal(pd.Index([1, 3, 5]), sorted_index)
@@ -0,0 +1,76 @@
import numpy as np
from shapely.geometry import Point
from shapely.wkt import loads
import geopandas
import pytest
from pandas.testing import assert_series_equal
def test_hilbert_distance():
    """Check the Hilbert code algorithm against hardcoded values per level."""
    geoms = geopandas.GeoSeries.from_wkt(
        [
            "POINT (0 0)",
            "POINT (1 1)",
            "POINT (1 0)",
            "POLYGON ((0 0, 0 1, 1 1, 1 0, 0 0))",
        ]
    )
    expected_by_level = {
        2: [0, 10, 15, 2],
        3: [0, 42, 63, 10],
        16: [0, 2863311530, 4294967295, 715827882],
    }
    for level, expected in expected_by_level.items():
        distances = geoms.hilbert_distance(total_bounds=(0, 0, 1, 1), level=level)
        assert distances.tolist() == expected
@pytest.fixture
def geoseries_points():
    """Return a GeoSeries of four points."""
    coords = [(1, 2), (2, 3), (3, 4), (4, 1)]
    return geopandas.GeoSeries([Point(x, y) for x, y in coords])
def test_hilbert_distance_level(geoseries_points):
    """``level=20`` is rejected with a ValueError."""
    too_high_level = 20
    with pytest.raises(ValueError):
        geoseries_points.hilbert_distance(level=too_high_level)
def test_specified_total_bounds(geoseries_points):
    """Passing the series' own total bounds equals the default behaviour."""
    own_bounds = geoseries_points.total_bounds
    with_bounds = geoseries_points.hilbert_distance(total_bounds=own_bounds)
    default = geoseries_points.hilbert_distance()
    assert_series_equal(with_bounds, default)
@pytest.mark.parametrize(
    "empty",
    [
        None,
        loads("POLYGON EMPTY"),
    ],
)
def test_empty(geoseries_points, empty):
    """A missing or empty geometry makes ``hilbert_distance`` raise."""
    # replace the last point by the missing / empty geometry
    geoseries_points.iloc[-1] = empty
    expected_message = "cannot be computed on a GeoSeries with empty"
    with pytest.raises(ValueError, match=expected_message):
        geoseries_points.hilbert_distance()
def test_zero_width():
    """Points on one vertical line must not trigger floating point errors."""
    # special case of all points on the same line -> avoid warnings because
    # of division by 0 and introducing NaN
    collinear = geopandas.GeoSeries([Point(0, 0), Point(0, 2), Point(0, 1)])
    with np.errstate(all="raise"):
        distances = collinear.hilbert_distance()
    order = np.array(distances).argsort()
    assert order.tolist() == [0, 2, 1]
@@ -0,0 +1,67 @@
import numpy
import geopandas
from geopandas.tools._random import uniform
import pytest
@pytest.fixture
def multipolygons(nybb_filename):
    """Geometry column of the file behind the ``nybb_filename`` fixture."""
    frame = geopandas.read_file(nybb_filename)
    return frame.geometry
@pytest.fixture
def polygons(multipolygons):
    """The ``multipolygons`` geometries exploded into single parts."""
    exploded = multipolygons.explode(ignore_index=True)
    return exploded.geometry
@pytest.fixture
def multilinestrings(multipolygons):
    """Boundaries of the ``multipolygons`` geometries."""
    boundaries = multipolygons.boundary
    return boundaries
@pytest.fixture
def linestrings(polygons):
    """Boundaries of the ``polygons`` geometries."""
    boundaries = polygons.boundary
    return boundaries
@pytest.fixture
def points(multipolygons):
    """Centroids of the ``multipolygons`` geometries."""
    centroids = multipolygons.centroid
    return centroids
@pytest.mark.parametrize("size", [10, 100])
@pytest.mark.parametrize(
    "geom_fixture", ["multipolygons", "polygons", "multilinestrings", "linestrings"]
)
def test_uniform(geom_fixture, size, request):
    """``uniform`` returns ``size`` points that all lie on the geometry."""
    geom = request.getfixturevalue(geom_fixture)[0]
    sample = uniform(geom, size=size, rng=1)

    # split the sample into its individual points
    exploded = geopandas.GeoSeries(sample).explode(index_parts=True)
    sample_points = exploded.reset_index(drop=True)
    assert len(sample_points) == size

    # tiny buffer so that points on a line still register as intersecting
    buffered = sample_points.buffer(0.00000001)
    hits = buffered.sindex.query(geom, predicate="intersects")
    assert len(hits) == size
def test_uniform_unsupported(points):
    """Sampling from a point warns and yields an empty geometry."""
    with pytest.warns(UserWarning, match="Sampling is not supported"):
        result = uniform(points[0], size=10, rng=1)
    assert result.is_empty
def test_uniform_generator(polygons):
    """Integer seeds are reproducible; a shared Generator advances its state."""
    polygon = polygons[0]

    # the same integer seed gives the same sample
    seeded_first = uniform(polygon, size=10, rng=1)
    seeded_second = uniform(polygon, size=10, rng=1)
    assert seeded_first.equals(seeded_second)

    # a Generator seeded alike matches on first use only
    generator = numpy.random.default_rng(seed=1)
    generated_first = uniform(polygon, size=10, rng=generator)
    generated_second = uniform(polygon, size=10, rng=generator)
    assert seeded_first.equals(generated_first)
    assert not seeded_first.equals(generated_second)
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,51 @@
from shapely.geometry import LineString, MultiPoint, Point
from geopandas import GeoSeries
from geopandas.tools import collect
import pytest
class TestTools:
    """Tests for ``geopandas.tools.collect``."""

    def setup_method(self):
        """Create the point, multi-point and line geometries used by the tests."""
        self.p1 = Point(0, 0)
        self.p2 = Point(1, 1)
        self.p3 = Point(2, 2)
        self.mpc = MultiPoint([self.p1, self.p2, self.p3])

        self.mp1 = MultiPoint([self.p1, self.p2])
        self.line1 = LineString([(3, 3), (4, 4)])

    def test_collect_single(self):
        """A single geometry is returned as is."""
        result = collect(self.p1)
        assert self.p1.equals(result)

    def test_collect_single_force_multi(self):
        """``multi=True`` wraps a single geometry into its Multi* type."""
        result = collect(self.p1, multi=True)
        expected = MultiPoint([self.p1])
        assert expected.equals(result)

    def test_collect_multi(self):
        """A single Multi* geometry is returned as is."""
        result = collect(self.mp1)
        assert self.mp1.equals(result)

    def test_collect_multi_force_multi(self):
        """A single Multi* geometry is returned as is even with ``multi=True``."""
        # multi=True was missing here, which made this test an exact
        # duplicate of test_collect_multi
        result = collect(self.mp1, multi=True)
        assert self.mp1.equals(result)

    def test_collect_list(self):
        """A list of points is collected into a MultiPoint."""
        result = collect([self.p1, self.p2, self.p3])
        assert self.mpc.equals(result)

    def test_collect_GeoSeries(self):
        """A GeoSeries of points is collected into a MultiPoint."""
        s = GeoSeries([self.p1, self.p2, self.p3])
        result = collect(s)
        assert self.mpc.equals(result)

    def test_collect_mixed_types(self):
        """Mixing geometry types raises a ValueError."""
        with pytest.raises(ValueError):
            collect([self.p1, self.line1])

    def test_collect_mixed_multi(self):
        """Passing several Multi* geometries raises a ValueError."""
        with pytest.raises(ValueError):
            collect([self.mpc, self.mp1])
@@ -0,0 +1,45 @@
import pandas as pd
from shapely.geometry import MultiLineString, MultiPoint, MultiPolygon
from shapely.geometry.base import BaseGeometry
# Maps the ``geom_type`` of a single part geometry to the Multi* class
# that ``collect`` uses to combine geometries of that type.
_multi_type_map = {
    "Point": MultiPoint,
    "LineString": MultiLineString,
    "Polygon": MultiPolygon,
}
def collect(x, multi=False):
    """
    Collect single part geometries into their Multi* counterpart

    Parameters
    ----------
    x : an iterable or Series of Shapely geometries, a GeoSeries, or
        a single Shapely geometry
    multi : boolean, default False
        if True, force returned geometries to be Multi* even if they
        only have one component.

    Returns
    -------
    The input geometry itself if there is exactly one and it either is
    already a Multi* geometry or ``multi`` is False; otherwise the Multi*
    geometry built from all input geometries.

    Raises
    ------
    IndexError
        If ``x`` contains no geometries.
    ValueError
        If the geometries are not all of the same type, or if more than one
        geometry is passed and they are Multi* geometries.
    """
    if isinstance(x, BaseGeometry):
        x = [x]
    else:
        # Materialise any iterable (Series, tuple, generator, ...): the code
        # below indexes it, takes its length and iterates it more than once.
        x = list(x)

    if not x:
        # same exception type as the former ``x[0]`` lookup on an empty list
        raise IndexError("Cannot collect an empty sequence of geometries")

    # We cannot create GeometryCollection here so all types
    # must be the same. If there is more than one element,
    # they cannot be Multi*, i.e., can't pass in combination of
    # Point and MultiPoint... or even just MultiPoint
    t = x[0].geom_type
    if not all(g.geom_type == t for g in x):
        raise ValueError("Geometry type must be homogeneous")

    is_multi_type = t.startswith("Multi")
    if len(x) > 1 and is_multi_type:
        raise ValueError(f"Cannot collect {t}. Must have single geometries")

    if len(x) == 1 and (is_multi_type or not multi):
        # If there's only one single part geom and we're not forcing to
        # multi, then just return it
        return x[0]
    return _multi_type_map[t](x)