refactor: excel parse
@@ -0,0 +1,614 @@
import json
from packaging.version import Version
from typing import Dict, Optional, Tuple

import numpy as np
import pandas as pd
import pyarrow as pa
from numpy.typing import NDArray

import shapely
from shapely import GeometryType

from geopandas import GeoDataFrame
from geopandas._compat import SHAPELY_GE_204
from geopandas.array import from_shapely, from_wkb

GEOARROW_ENCODINGS = [
    "point",
    "linestring",
    "polygon",
    "multipoint",
    "multilinestring",
    "multipolygon",
]
|
||||
|
||||
|
||||
## GeoPandas -> GeoArrow
|
||||
|
||||
|
||||
class ArrowTable:
|
||||
"""
|
||||
Wrapper class for Arrow data.
|
||||
|
||||
This class implements the `Arrow PyCapsule Protocol`_ (i.e. having an
|
||||
``__arrow_c_stream__`` method). This object can then be consumed by
|
||||
your Arrow implementation of choice that supports this protocol.
|
||||
|
||||
.. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
|
||||
|
||||
Example
|
||||
-------
|
||||
>>> import pyarrow as pa
|
||||
>>> pa.table(gdf.to_arrow()) # doctest: +SKIP
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, pa_table):
|
||||
self._pa_table = pa_table
|
||||
|
||||
def __arrow_c_stream__(self, requested_schema=None):
|
||||
return self._pa_table.__arrow_c_stream__(requested_schema=requested_schema)
|
||||
|
||||
|
||||
class GeoArrowArray:
|
||||
"""
|
||||
Wrapper class for a geometry array as Arrow data.
|
||||
|
||||
This class implements the `Arrow PyCapsule Protocol`_ (i.e. having an
|
||||
``__arrow_c_array/stream__`` method). This object can then be consumed by
|
||||
your Arrow implementation of choice that supports this protocol.
|
||||
|
||||
.. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
|
||||
|
||||
Example
|
||||
-------
|
||||
>>> import pyarrow as pa
|
||||
>>> pa.array(ser.to_arrow()) # doctest: +SKIP
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, pa_field, pa_array):
|
||||
self._pa_array = pa_array
|
||||
self._pa_field = pa_field
|
||||
|
||||
def __arrow_c_array__(self, requested_schema=None):
|
||||
if requested_schema is not None:
|
||||
raise NotImplementedError(
|
||||
"Requested schema is not supported for geometry arrays"
|
||||
)
|
||||
return (
|
||||
self._pa_field.__arrow_c_schema__(),
|
||||
self._pa_array.__arrow_c_array__()[1],
|
||||
)
|
||||
|
||||
|
||||
def geopandas_to_arrow(
|
||||
df,
|
||||
index=None,
|
||||
geometry_encoding="WKB",
|
||||
interleaved=True,
|
||||
include_z=None,
|
||||
):
|
||||
"""
|
||||
Convert GeoDataFrame to a pyarrow.Table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : GeoDataFrame
|
||||
The GeoDataFrame to convert.
|
||||
index : bool, default None
|
||||
If ``True``, always include the dataframe's index(es) as columns
|
||||
in the file output.
|
||||
If ``False``, the index(es) will not be written to the file.
|
||||
If ``None``, the index(es) will be included as columns in the file
|
||||
output except `RangeIndex` which is stored as metadata only.
|
||||
geometry_encoding : {'WKB', 'geoarrow' }, default 'WKB'
|
||||
The GeoArrow encoding to use for the data conversion.
|
||||
interleaved : bool, default True
|
||||
Only relevant for 'geoarrow' encoding. If True, the geometries'
|
||||
coordinates are interleaved in a single fixed size list array.
|
||||
If False, the coordinates are stored as separate arrays in a
|
||||
struct type.
|
||||
include_z : bool, default None
|
||||
Only relevant for 'geoarrow' encoding (for WKB, the dimensionality
|
||||
of the individual geometries is preserved).
|
||||
If False, return 2D geometries. If True, include the third dimension
|
||||
in the output (if a geometry has no third dimension, the z-coordinates
|
||||
will be NaN). By default, will infer the dimensionality from the
|
||||
input geometries. Note that this inference can be unreliable with
|
||||
empty geometries (for a guaranteed result, it is recommended to
|
||||
specify the keyword).
|
||||
|
||||
"""
|
||||
mask = df.dtypes == "geometry"
|
||||
geometry_columns = df.columns[mask]
|
||||
geometry_indices = np.asarray(mask).nonzero()[0]
|
||||
|
||||
df_attr = pd.DataFrame(df.copy(deep=False))
|
||||
|
||||
# replace geometry columns with dummy values -> will get converted to
|
||||
# Arrow null column (not holding any memory), so we can afterwards
|
||||
# fill the resulting table with the correct geometry fields
|
||||
for col in geometry_columns:
|
||||
df_attr[col] = None
|
||||
|
||||
table = pa.Table.from_pandas(df_attr, preserve_index=index)
|
||||
|
||||
geometry_encoding_dict = {}
|
||||
|
||||
if geometry_encoding.lower() == "geoarrow":
|
||||
if Version(pa.__version__) < Version("10.0.0"):
|
||||
raise ValueError("Converting to 'geoarrow' requires pyarrow >= 10.0.")
|
||||
|
||||
# Encode all geometry columns to GeoArrow
|
||||
for i, col in zip(geometry_indices, geometry_columns):
|
||||
field, geom_arr = construct_geometry_array(
|
||||
np.array(df[col].array),
|
||||
include_z=include_z,
|
||||
field_name=col,
|
||||
crs=df[col].crs,
|
||||
interleaved=interleaved,
|
||||
)
|
||||
table = table.set_column(i, field, geom_arr)
|
||||
geometry_encoding_dict[col] = (
|
||||
field.metadata[b"ARROW:extension:name"]
|
||||
.decode()
|
||||
.removeprefix("geoarrow.")
|
||||
)
|
||||
|
||||
elif geometry_encoding.lower() == "wkb":
|
||||
# Encode all geometry columns to WKB
|
||||
for i, col in zip(geometry_indices, geometry_columns):
|
||||
field, wkb_arr = construct_wkb_array(
|
||||
np.asarray(df[col].array), field_name=col, crs=df[col].crs
|
||||
)
|
||||
table = table.set_column(i, field, wkb_arr)
|
||||
geometry_encoding_dict[col] = "WKB"
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Expected geometry encoding 'WKB' or 'geoarrow' got {geometry_encoding}"
|
||||
)
|
||||
return table, geometry_encoding_dict
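# Illustrative usage sketch (the names ``gdf``, ``table`` and ``encodings``
# below are hypothetical, not part of this module):
# >>> import geopandas
# >>> from shapely.geometry import Point
# >>> gdf = geopandas.GeoDataFrame(
# ...     {"a": [1, 2]}, geometry=[Point(0, 0), Point(1, 1)], crs="EPSG:4326"
# ... )
# >>> table, encodings = geopandas_to_arrow(gdf, geometry_encoding="WKB")
# >>> encodings  # doctest: +SKIP
# {'geometry': 'WKB'}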
|
||||
|
||||
|
||||
def construct_wkb_array(
|
||||
shapely_arr: NDArray[np.object_],
|
||||
*,
|
||||
field_name: str = "geometry",
|
||||
crs: Optional[str] = None,
|
||||
) -> Tuple[pa.Field, pa.Array]:
|
||||
|
||||
if shapely.geos_version > (3, 10, 0):
|
||||
kwargs = {"flavor": "iso"}
|
||||
else:
|
||||
if shapely.has_z(shapely_arr).any():
|
||||
raise ValueError("Cannot write 3D geometries with GEOS<3.10")
|
||||
kwargs = {}
|
||||
|
||||
wkb_arr = shapely.to_wkb(shapely_arr, **kwargs)
|
||||
extension_metadata = {"ARROW:extension:name": "geoarrow.wkb"}
|
||||
if crs is not None:
|
||||
extension_metadata["ARROW:extension:metadata"] = json.dumps(
|
||||
{"crs": crs.to_json()}
|
||||
)
|
||||
else:
|
||||
# In theory this should not be needed, but otherwise pyarrow < 17
|
||||
# crashes on receiving such data through C Data Interface
|
||||
# https://github.com/apache/arrow/issues/41741
|
||||
extension_metadata["ARROW:extension:metadata"] = "{}"
|
||||
|
||||
field = pa.field(
|
||||
field_name, type=pa.binary(), nullable=True, metadata=extension_metadata
|
||||
)
|
||||
parr = pa.array(np.asarray(wkb_arr), pa.binary())
|
||||
return field, parr
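# Rough sketch of what this helper returns (assuming GEOS >= 3.10 so ISO WKB
# is used; the input array is hypothetical):
# >>> arr = np.array([shapely.Point(0, 0), shapely.Point(1, 1)], dtype=object)
# >>> field, parr = construct_wkb_array(arr)
# >>> field.metadata[b"ARROW:extension:name"]
# b'geoarrow.wkb'
# >>> parr.type
# DataType(binary)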
|
||||
|
||||
|
||||
def _convert_inner_coords(coords, interleaved, dims, mask=None):
|
||||
if interleaved:
|
||||
coords_field = pa.field(dims, pa.float64(), nullable=False)
|
||||
typ = pa.list_(coords_field, len(dims))
|
||||
if mask is None:
|
||||
# mask keyword only added in pyarrow 15.0.0
|
||||
parr = pa.FixedSizeListArray.from_arrays(coords.ravel(), type=typ)
|
||||
else:
|
||||
parr = pa.FixedSizeListArray.from_arrays(
|
||||
coords.ravel(), type=typ, mask=mask
|
||||
)
|
||||
else:
|
||||
if dims == "xy":
|
||||
fields = [
|
||||
pa.field("x", pa.float64(), nullable=False),
|
||||
pa.field("y", pa.float64(), nullable=False),
|
||||
]
|
||||
parr = pa.StructArray.from_arrays(
|
||||
[coords[:, 0].copy(), coords[:, 1].copy()], fields=fields, mask=mask
|
||||
)
|
||||
else:
|
||||
fields = [
|
||||
pa.field("x", pa.float64(), nullable=False),
|
||||
pa.field("y", pa.float64(), nullable=False),
|
||||
pa.field("z", pa.float64(), nullable=False),
|
||||
]
|
||||
parr = pa.StructArray.from_arrays(
|
||||
[coords[:, 0].copy(), coords[:, 1].copy(), coords[:, 2].copy()],
|
||||
fields=fields,
|
||||
mask=mask,
|
||||
)
|
||||
return parr
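# Illustrative behaviour for a small 2D coordinate array (hypothetical input;
# exact reprs omitted):
# >>> coords = np.array([[0.0, 1.0], [2.0, 3.0]])
# >>> _convert_inner_coords(coords, interleaved=True, dims="xy")  # doctest: +SKIP
# fixed_size_list<xy: double not null>[2] array with 2 elements
# >>> _convert_inner_coords(coords, interleaved=False, dims="xy")  # doctest: +SKIP
# struct<x: double not null, y: double not null> array with 2 elements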
|
||||
|
||||
|
||||
def _linestring_type(point_type):
|
||||
return pa.list_(pa.field("vertices", point_type, nullable=False))
|
||||
|
||||
|
||||
def _polygon_type(point_type):
|
||||
return pa.list_(
|
||||
pa.field(
|
||||
"rings",
|
||||
pa.list_(pa.field("vertices", point_type, nullable=False)),
|
||||
nullable=False,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _multipoint_type(point_type):
|
||||
return pa.list_(pa.field("points", point_type, nullable=False))
|
||||
|
||||
|
||||
def _multilinestring_type(point_type):
|
||||
return pa.list_(
|
||||
pa.field("linestrings", _linestring_type(point_type), nullable=False)
|
||||
)
|
||||
|
||||
|
||||
def _multipolygon_type(point_type):
|
||||
return pa.list_(pa.field("polygons", _polygon_type(point_type), nullable=False))
|
||||
|
||||
|
||||
def construct_geometry_array(
|
||||
shapely_arr: NDArray[np.object_],
|
||||
include_z: Optional[bool] = None,
|
||||
*,
|
||||
field_name: str = "geometry",
|
||||
crs: Optional[str] = None,
|
||||
interleaved: bool = True,
|
||||
) -> Tuple[pa.Field, pa.Array]:
|
||||
# NOTE: this implementation returns a (field, array) pair so that it can set the
|
||||
# extension metadata on the field without instantiating extension types into the
|
||||
# global pyarrow registry
|
||||
geom_type, coords, offsets = shapely.to_ragged_array(
|
||||
shapely_arr, include_z=include_z
|
||||
)
|
||||
|
||||
mask = shapely.is_missing(shapely_arr)
|
||||
if mask.any():
|
||||
if (
|
||||
geom_type == GeometryType.POINT
|
||||
and interleaved
|
||||
and Version(pa.__version__) < Version("15.0.0")
|
||||
):
|
||||
raise ValueError(
|
||||
"Converting point geometries with missing values is not supported "
|
||||
"for interleaved coordinates with pyarrow < 15.0.0. Please "
|
||||
"upgrade to a newer version of pyarrow."
|
||||
)
|
||||
mask = pa.array(mask, type=pa.bool_())
|
||||
|
||||
if geom_type == GeometryType.POINT and not SHAPELY_GE_204:
|
||||
# bug in shapely < 2.0.4, see https://github.com/shapely/shapely/pull/2034
|
||||
# this workaround only works if there are no empty points
|
||||
indices = np.nonzero(mask)[0]
|
||||
indices = indices - np.arange(len(indices))
|
||||
coords = np.insert(coords, indices, np.nan, axis=0)
|
||||
|
||||
else:
|
||||
mask = None
|
||||
|
||||
if coords.shape[-1] == 2:
|
||||
dims = "xy"
|
||||
elif coords.shape[-1] == 3:
|
||||
dims = "xyz"
|
||||
else:
|
||||
raise ValueError(f"Unexpected coords dimensions: {coords.shape}")
|
||||
|
||||
extension_metadata: Dict[str, str] = {}
|
||||
if crs is not None:
|
||||
extension_metadata["ARROW:extension:metadata"] = json.dumps(
|
||||
{"crs": crs.to_json()}
|
||||
)
|
||||
else:
|
||||
# In theory this should not be needed, but otherwise pyarrow < 17
|
||||
# crashes on receiving such data through C Data Interface
|
||||
# https://github.com/apache/arrow/issues/41741
|
||||
extension_metadata["ARROW:extension:metadata"] = "{}"
|
||||
|
||||
if geom_type == GeometryType.POINT:
|
||||
parr = _convert_inner_coords(coords, interleaved, dims, mask=mask)
|
||||
extension_metadata["ARROW:extension:name"] = "geoarrow.point"
|
||||
field = pa.field(
|
||||
field_name,
|
||||
parr.type,
|
||||
nullable=True,
|
||||
metadata=extension_metadata,
|
||||
)
|
||||
return field, parr
|
||||
|
||||
elif geom_type == GeometryType.LINESTRING:
|
||||
assert len(offsets) == 1, "Expected one offsets array"
|
||||
(geom_offsets,) = offsets
|
||||
_parr = _convert_inner_coords(coords, interleaved, dims)
|
||||
parr = pa.ListArray.from_arrays(
|
||||
pa.array(geom_offsets), _parr, _linestring_type(_parr.type), mask=mask
|
||||
)
|
||||
extension_metadata["ARROW:extension:name"] = "geoarrow.linestring"
|
||||
field = pa.field(
|
||||
field_name,
|
||||
parr.type,
|
||||
nullable=True,
|
||||
metadata=extension_metadata,
|
||||
)
|
||||
return field, parr
|
||||
|
||||
elif geom_type == GeometryType.POLYGON:
|
||||
assert len(offsets) == 2, "Expected two offsets arrays"
|
||||
ring_offsets, geom_offsets = offsets
|
||||
_parr = _convert_inner_coords(coords, interleaved, dims)
|
||||
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
|
||||
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
|
||||
parr = parr.cast(_polygon_type(_parr.type))
|
||||
extension_metadata["ARROW:extension:name"] = "geoarrow.polygon"
|
||||
field = pa.field(
|
||||
field_name,
|
||||
parr.type,
|
||||
nullable=True,
|
||||
metadata=extension_metadata,
|
||||
)
|
||||
return field, parr
|
||||
|
||||
elif geom_type == GeometryType.MULTIPOINT:
|
||||
assert len(offsets) == 1, "Expected one offsets array"
|
||||
(geom_offsets,) = offsets
|
||||
_parr = _convert_inner_coords(coords, interleaved, dims)
|
||||
parr = pa.ListArray.from_arrays(
|
||||
pa.array(geom_offsets), _parr, type=_multipoint_type(_parr.type), mask=mask
|
||||
)
|
||||
extension_metadata["ARROW:extension:name"] = "geoarrow.multipoint"
|
||||
field = pa.field(
|
||||
field_name,
|
||||
parr.type,
|
||||
nullable=True,
|
||||
metadata=extension_metadata,
|
||||
)
|
||||
return field, parr
|
||||
|
||||
elif geom_type == GeometryType.MULTILINESTRING:
|
||||
assert len(offsets) == 2, "Expected two offsets arrays"
|
||||
ring_offsets, geom_offsets = offsets
|
||||
_parr = _convert_inner_coords(coords, interleaved, dims)
|
||||
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
|
||||
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
|
||||
parr = parr.cast(_multilinestring_type(_parr.type))
|
||||
extension_metadata["ARROW:extension:name"] = "geoarrow.multilinestring"
|
||||
field = pa.field(
|
||||
field_name,
|
||||
parr.type,
|
||||
nullable=True,
|
||||
metadata=extension_metadata,
|
||||
)
|
||||
return field, parr
|
||||
|
||||
elif geom_type == GeometryType.MULTIPOLYGON:
|
||||
assert len(offsets) == 3, "Expected three offsets arrays"
|
||||
ring_offsets, polygon_offsets, geom_offsets = offsets
|
||||
_parr = _convert_inner_coords(coords, interleaved, dims)
|
||||
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
|
||||
_parr2 = pa.ListArray.from_arrays(pa.array(polygon_offsets), _parr1)
|
||||
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr2, mask=mask)
|
||||
parr = parr.cast(_multipolygon_type(_parr.type))
|
||||
extension_metadata["ARROW:extension:name"] = "geoarrow.multipolygon"
|
||||
field = pa.field(
|
||||
field_name,
|
||||
parr.type,
|
||||
nullable=True,
|
||||
metadata=extension_metadata,
|
||||
)
|
||||
return field, parr
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported type for geoarrow: {geom_type}")
|
||||
|
||||
|
||||
## GeoArrow -> GeoPandas
|
||||
|
||||
|
||||
def _get_arrow_geometry_field(field):
|
||||
if (meta := field.metadata) is not None:
|
||||
if (ext_name := meta.get(b"ARROW:extension:name", None)) is not None:
|
||||
if ext_name.startswith(b"geoarrow."):
|
||||
if (
|
||||
ext_meta := meta.get(b"ARROW:extension:metadata", None)
|
||||
) is not None:
|
||||
ext_meta = json.loads(ext_meta.decode())
|
||||
return ext_name.decode(), ext_meta
|
||||
|
||||
if isinstance(field.type, pa.ExtensionType):
|
||||
ext_name = field.type.extension_name
|
||||
if ext_name.startswith("geoarrow."):
|
||||
ext_meta_ser = field.type.__arrow_ext_serialize__()
|
||||
if ext_meta_ser:
|
||||
ext_meta = json.loads(ext_meta_ser.decode())
|
||||
else:
|
||||
ext_meta = None
|
||||
return ext_name, ext_meta
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def arrow_to_geopandas(table, geometry=None):
|
||||
"""
|
||||
Convert Arrow table object to a GeoDataFrame based on GeoArrow extension types.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
table : pyarrow.Table
|
||||
The Arrow table to convert.
|
||||
geometry : str, default None
|
||||
The name of the geometry column to set as the active geometry
|
||||
column. If None, the first geometry column found will be used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
GeoDataFrame
|
||||
|
||||
"""
|
||||
if not isinstance(table, pa.Table):
|
||||
table = pa.table(table)
|
||||
|
||||
geom_fields = []
|
||||
|
||||
for i, field in enumerate(table.schema):
|
||||
geom = _get_arrow_geometry_field(field)
|
||||
if geom is not None:
|
||||
geom_fields.append((i, field.name, *geom))
|
||||
|
||||
if len(geom_fields) == 0:
|
||||
raise ValueError("No geometry column found in the Arrow table.")
|
||||
|
||||
table_attr = table.drop([f[1] for f in geom_fields])
|
||||
df = table_attr.to_pandas()
|
||||
|
||||
for i, col, ext_name, ext_meta in geom_fields:
|
||||
crs = None
|
||||
if ext_meta is not None and "crs" in ext_meta:
|
||||
crs = ext_meta["crs"]
|
||||
|
||||
if ext_name == "geoarrow.wkb":
|
||||
geom_arr = from_wkb(np.array(table[col]), crs=crs)
|
||||
elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
|
||||
|
||||
geom_arr = from_shapely(
|
||||
construct_shapely_array(table[col].combine_chunks(), ext_name), crs=crs
|
||||
)
|
||||
else:
|
||||
raise TypeError(f"Unknown GeoArrow extension type: {ext_name}")
|
||||
|
||||
df.insert(i, col, geom_arr)
|
||||
|
||||
return GeoDataFrame(df, geometry=geometry or geom_fields[0][1])
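# Illustrative round trip through Arrow (``gdf`` is a hypothetical
# GeoDataFrame, e.g. the one from the sketch after ``geopandas_to_arrow``):
# >>> table, _ = geopandas_to_arrow(gdf, geometry_encoding="geoarrow")
# >>> arrow_to_geopandas(table)  # doctest: +SKIP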
|
||||
|
||||
|
||||
def arrow_to_geometry_array(arr):
|
||||
"""
|
||||
Convert Arrow array object (representing single GeoArrow array) to a
|
||||
geopandas GeometryArray.
|
||||
|
||||
Specifically for GeoSeries.from_arrow.
|
||||
"""
|
||||
if Version(pa.__version__) < Version("14.0.0"):
|
||||
raise ValueError("Importing from Arrow requires pyarrow >= 14.0.")
|
||||
|
||||
schema_capsule, array_capsule = arr.__arrow_c_array__()
|
||||
field = pa.Field._import_from_c_capsule(schema_capsule)
|
||||
pa_arr = pa.Array._import_from_c_capsule(field.__arrow_c_schema__(), array_capsule)
|
||||
|
||||
geom_info = _get_arrow_geometry_field(field)
|
||||
if geom_info is None:
|
||||
raise ValueError("No GeoArrow geometry field found.")
|
||||
ext_name, ext_meta = geom_info
|
||||
|
||||
crs = None
|
||||
if ext_meta is not None and "crs" in ext_meta:
|
||||
crs = ext_meta["crs"]
|
||||
|
||||
if ext_name == "geoarrow.wkb":
|
||||
geom_arr = from_wkb(np.array(pa_arr), crs=crs)
|
||||
elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
|
||||
|
||||
geom_arr = from_shapely(construct_shapely_array(pa_arr, ext_name), crs=crs)
|
||||
else:
|
||||
raise ValueError(f"Unknown GeoArrow extension type: {ext_name}")
|
||||
|
||||
return geom_arr
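# Illustrative sketch (requires pyarrow >= 14): any object implementing the
# Arrow PyCapsule protocol can be passed, e.g. the wrapper returned by
# ``GeoSeries.to_arrow()``. The series ``ser`` is hypothetical.
# >>> ser = GeoDataFrame(geometry=[shapely.Point(0, 0)]).geometry
# >>> arrow_to_geometry_array(ser.to_arrow())  # doctest: +SKIP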
|
||||
|
||||
|
||||
def _get_inner_coords(arr):
|
||||
if pa.types.is_struct(arr.type):
|
||||
if arr.type.num_fields == 2:
|
||||
coords = np.column_stack(
|
||||
[np.asarray(arr.field("x")), np.asarray(arr.field("y"))]
|
||||
)
|
||||
else:
|
||||
coords = np.column_stack(
|
||||
[
|
||||
np.asarray(arr.field("x")),
|
||||
np.asarray(arr.field("y")),
|
||||
np.asarray(arr.field("z")),
|
||||
]
|
||||
)
|
||||
return coords
|
||||
else:
|
||||
# fixed size list
|
||||
return np.asarray(arr.values).reshape(len(arr), -1)
|
||||
|
||||
|
||||
def construct_shapely_array(arr: pa.Array, extension_name: str):
|
||||
"""
|
||||
Construct a NumPy array of shapely geometries from a pyarrow.Array
|
||||
with GeoArrow extension type.
|
||||
|
||||
"""
|
||||
if isinstance(arr, pa.ExtensionArray):
|
||||
arr = arr.storage
|
||||
|
||||
if extension_name == "geoarrow.point":
|
||||
coords = _get_inner_coords(arr)
|
||||
result = shapely.from_ragged_array(GeometryType.POINT, coords, None)
|
||||
|
||||
elif extension_name == "geoarrow.linestring":
|
||||
coords = _get_inner_coords(arr.values)
|
||||
offsets1 = np.asarray(arr.offsets)
|
||||
offsets = (offsets1,)
|
||||
result = shapely.from_ragged_array(GeometryType.LINESTRING, coords, offsets)
|
||||
|
||||
elif extension_name == "geoarrow.polygon":
|
||||
coords = _get_inner_coords(arr.values.values)
|
||||
offsets2 = np.asarray(arr.offsets)
|
||||
offsets1 = np.asarray(arr.values.offsets)
|
||||
offsets = (offsets1, offsets2)
|
||||
result = shapely.from_ragged_array(GeometryType.POLYGON, coords, offsets)
|
||||
|
||||
elif extension_name == "geoarrow.multipoint":
|
||||
coords = _get_inner_coords(arr.values)
|
||||
offsets1 = np.asarray(arr.offsets)
|
||||
offsets = (offsets1,)
|
||||
result = shapely.from_ragged_array(GeometryType.MULTIPOINT, coords, offsets)
|
||||
|
||||
elif extension_name == "geoarrow.multilinestring":
|
||||
coords = _get_inner_coords(arr.values.values)
|
||||
offsets2 = np.asarray(arr.offsets)
|
||||
offsets1 = np.asarray(arr.values.offsets)
|
||||
offsets = (offsets1, offsets2)
|
||||
result = shapely.from_ragged_array(
|
||||
GeometryType.MULTILINESTRING, coords, offsets
|
||||
)
|
||||
|
||||
elif extension_name == "geoarrow.multipolygon":
|
||||
coords = _get_inner_coords(arr.values.values.values)
|
||||
offsets3 = np.asarray(arr.offsets)
|
||||
offsets2 = np.asarray(arr.values.offsets)
|
||||
offsets1 = np.asarray(arr.values.values.offsets)
|
||||
offsets = (offsets1, offsets2, offsets3)
|
||||
result = shapely.from_ragged_array(GeometryType.MULTIPOLYGON, coords, offsets)
|
||||
|
||||
else:
|
||||
raise ValueError(extension_name)
|
||||
|
||||
# apply validity mask
|
||||
if arr.null_count:
|
||||
mask = np.asarray(arr.is_null())
|
||||
result = np.where(mask, None, result)
|
||||
|
||||
return result
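# Illustrative inverse of construct_geometry_array (names hypothetical):
# >>> field, parr = construct_geometry_array(
# ...     np.array([shapely.Point(0, 0)], dtype=object)
# ... )
# >>> construct_shapely_array(parr, "geoarrow.point")  # doctest: +SKIP
# array([<POINT (0 0)>], dtype=object)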
|
||||
@@ -0,0 +1,72 @@
|
||||
from packaging.version import Version
|
||||
|
||||
import pyarrow
|
||||
|
||||
_ERROR_MSG = """\
|
||||
Disallowed deserialization of 'arrow.py_extension_type':
|
||||
storage_type = {storage_type}
|
||||
serialized = {serialized}
|
||||
pickle disassembly:\n{pickle_disassembly}
|
||||
|
||||
Reading of untrusted Parquet or Feather files with a PyExtensionType column
|
||||
allows arbitrary code execution.
|
||||
If you trust this file, you can enable reading the extension type by one of:
|
||||
|
||||
- upgrade to pyarrow >= 14.0.1 and call `pa.PyExtensionType.set_auto_load(True)`
|
||||
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
|
||||
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
|
||||
|
||||
We strongly recommend updating your Parquet/Feather files to use extension types
|
||||
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
|
||||
See https://arrow.apache.org/docs/dev/python/extending_types.html#defining-extension-types-user-defined-types
|
||||
for more details.
|
||||
"""
|
||||
|
||||
|
||||
def patch_pyarrow():
|
||||
# starting from pyarrow 14.0.1, it has its own mechanism
|
||||
if Version(pyarrow.__version__) >= Version("14.0.1"):
|
||||
return
|
||||
|
||||
# if the user has pyarrow_hotfix (https://github.com/pitrou/pyarrow-hotfix)
|
||||
# installed, use this instead (which also ensures it works if they had
|
||||
# called `pyarrow_hotfix.uninstall()`)
|
||||
try:
|
||||
import pyarrow_hotfix # noqa: F401
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
return
|
||||
|
||||
# if the hotfix is already installed and enabled
|
||||
if getattr(pyarrow, "_hotfix_installed", False):
|
||||
return
|
||||
|
||||
class ForbiddenExtensionType(pyarrow.ExtensionType):
|
||||
def __arrow_ext_serialize__(self):
|
||||
return b""
|
||||
|
||||
@classmethod
|
||||
def __arrow_ext_deserialize__(cls, storage_type, serialized):
|
||||
import io
|
||||
import pickletools
|
||||
|
||||
out = io.StringIO()
|
||||
pickletools.dis(serialized, out)
|
||||
raise RuntimeError(
|
||||
_ERROR_MSG.format(
|
||||
storage_type=storage_type,
|
||||
serialized=serialized,
|
||||
pickle_disassembly=out.getvalue(),
|
||||
)
|
||||
)
|
||||
|
||||
pyarrow.unregister_extension_type("arrow.py_extension_type")
|
||||
pyarrow.register_extension_type(
|
||||
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
|
||||
)
|
||||
|
||||
pyarrow._hotfix_installed = True
|
||||
|
||||
|
||||
patch_pyarrow()
|
||||
@@ -0,0 +1,913 @@
|
||||
import json
|
||||
import warnings
|
||||
from packaging.version import Version
|
||||
|
||||
import numpy as np
|
||||
from pandas import DataFrame, Series
|
||||
|
||||
import shapely
|
||||
|
||||
import geopandas
|
||||
from geopandas import GeoDataFrame
|
||||
from geopandas._compat import import_optional_dependency
|
||||
from geopandas.array import from_shapely, from_wkb
|
||||
|
||||
from .file import _expand_user
|
||||
|
||||
METADATA_VERSION = "1.0.0"
|
||||
SUPPORTED_VERSIONS = ["0.1.0", "0.4.0", "1.0.0-beta.1", "1.0.0", "1.1.0"]
|
||||
GEOARROW_ENCODINGS = [
|
||||
"point",
|
||||
"linestring",
|
||||
"polygon",
|
||||
"multipoint",
|
||||
"multilinestring",
|
||||
"multipolygon",
|
||||
]
|
||||
SUPPORTED_ENCODINGS = ["WKB"] + GEOARROW_ENCODINGS
|
||||
|
||||
# reference: https://github.com/opengeospatial/geoparquet
|
||||
|
||||
# Metadata structure:
|
||||
# {
|
||||
# "geo": {
|
||||
# "columns": {
|
||||
# "<name>": {
|
||||
# "encoding": "WKB"
|
||||
# "geometry_types": <list of str: REQUIRED>
|
||||
# "crs": "<PROJJSON or None: OPTIONAL>",
|
||||
# "orientation": "<'counterclockwise' or None: OPTIONAL>"
|
||||
# "edges": "planar"
|
||||
# "bbox": <list of [xmin, ymin, xmax, ymax]: OPTIONAL>
|
||||
# "epoch": <float: OPTIONAL>
|
||||
# }
|
||||
# },
|
||||
# "primary_column": "<str: REQUIRED>",
|
||||
# "version": "<METADATA_VERSION>",
|
||||
#
|
||||
# # Additional GeoPandas specific metadata (not in metadata spec)
|
||||
# "creator": {
|
||||
# "library": "geopandas",
|
||||
# "version": "<geopandas.__version__>"
|
||||
# }
|
||||
# }
|
||||
# }
|
||||
|
||||
|
||||
def _is_fsspec_url(url):
|
||||
return (
|
||||
isinstance(url, str)
|
||||
and "://" in url
|
||||
and not url.startswith(("http://", "https://"))
|
||||
)
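# For reference, a few hypothetical inputs and the expected results:
# >>> _is_fsspec_url("s3://bucket/data.parquet")
# True
# >>> _is_fsspec_url("https://example.com/data.parquet")
# False
# >>> _is_fsspec_url("/local/path/data.parquet")
# False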
|
||||
|
||||
|
||||
def _remove_id_from_member_of_ensembles(json_dict):
|
||||
"""
|
||||
Older PROJ versions will not recognize IDs of datum ensemble members that
|
||||
were added in more recent PROJ database versions.
|
||||
|
||||
Cf https://github.com/opengeospatial/geoparquet/discussions/110
|
||||
and https://github.com/OSGeo/PROJ/pull/3221
|
||||
|
||||
Mimicking the patch to GDAL from https://github.com/OSGeo/gdal/pull/5872
|
||||
"""
|
||||
for key, value in json_dict.items():
|
||||
if isinstance(value, dict):
|
||||
_remove_id_from_member_of_ensembles(value)
|
||||
elif key == "members" and isinstance(value, list):
|
||||
for member in value:
|
||||
member.pop("id", None)
|
||||
|
||||
|
||||
# type ids 0 to 7
|
||||
_geometry_type_names = [
|
||||
"Point",
|
||||
"LineString",
|
||||
"LineString",
|
||||
"Polygon",
|
||||
"MultiPoint",
|
||||
"MultiLineString",
|
||||
"MultiPolygon",
|
||||
"GeometryCollection",
|
||||
]
|
||||
_geometry_type_names += [geom_type + " Z" for geom_type in _geometry_type_names]
|
||||
|
||||
|
||||
def _get_geometry_types(series):
|
||||
"""
|
||||
Get unique geometry types from a GeoSeries.
|
||||
"""
|
||||
arr_geometry_types = shapely.get_type_id(series.array._data)
|
||||
# ensure to include "... Z" for 3D geometries
|
||||
has_z = shapely.has_z(series.array._data)
|
||||
arr_geometry_types[has_z] += 8
|
||||
|
||||
geometry_types = Series(arr_geometry_types).unique().tolist()
|
||||
# drop missing values (shapely.get_type_id returns -1 for those)
|
||||
if -1 in geometry_types:
|
||||
geometry_types.remove(-1)
|
||||
|
||||
return sorted([_geometry_type_names[idx] for idx in geometry_types])
|
||||
|
||||
|
||||
def _create_metadata(
|
||||
df, schema_version=None, geometry_encoding=None, write_covering_bbox=False
|
||||
):
|
||||
"""Create and encode geo metadata dict.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : GeoDataFrame
|
||||
schema_version : {'0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', '1.1.0', None}
|
||||
GeoParquet specification version; if not provided will default to
|
||||
latest supported version.
|
||||
write_covering_bbox : bool, default False
|
||||
Writes the bounding box column for each row entry with column
|
||||
name 'bbox'. Writing a bbox column can be computationally
|
||||
expensive, hence its default setting is False.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
"""
|
||||
if schema_version is None:
|
||||
if geometry_encoding and any(
|
||||
encoding != "WKB" for encoding in geometry_encoding.values()
|
||||
):
|
||||
schema_version = "1.1.0"
|
||||
else:
|
||||
schema_version = METADATA_VERSION
|
||||
|
||||
if schema_version not in SUPPORTED_VERSIONS:
|
||||
raise ValueError(
|
||||
f"schema_version must be one of: {', '.join(SUPPORTED_VERSIONS)}"
|
||||
)
|
||||
|
||||
# Construct metadata for each geometry
|
||||
column_metadata = {}
|
||||
for col in df.columns[df.dtypes == "geometry"]:
|
||||
series = df[col]
|
||||
|
||||
geometry_types = _get_geometry_types(series)
|
||||
if schema_version[0] == "0":
|
||||
geometry_types_name = "geometry_type"
|
||||
if len(geometry_types) == 1:
|
||||
geometry_types = geometry_types[0]
|
||||
else:
|
||||
geometry_types_name = "geometry_types"
|
||||
|
||||
crs = None
|
||||
if series.crs:
|
||||
if schema_version == "0.1.0":
|
||||
crs = series.crs.to_wkt()
|
||||
else: # version >= 0.4.0
|
||||
crs = series.crs.to_json_dict()
|
||||
_remove_id_from_member_of_ensembles(crs)
|
||||
|
||||
column_metadata[col] = {
|
||||
"encoding": geometry_encoding[col],
|
||||
"crs": crs,
|
||||
geometry_types_name: geometry_types,
|
||||
}
|
||||
|
||||
bbox = series.total_bounds.tolist()
|
||||
if np.isfinite(bbox).all():
|
||||
# don't add bbox with NaNs for empty / all-NA geometry column
|
||||
column_metadata[col]["bbox"] = bbox
|
||||
|
||||
if write_covering_bbox:
|
||||
column_metadata[col]["covering"] = {
|
||||
"bbox": {
|
||||
"xmin": ["bbox", "xmin"],
|
||||
"ymin": ["bbox", "ymin"],
|
||||
"xmax": ["bbox", "xmax"],
|
||||
"ymax": ["bbox", "ymax"],
|
||||
},
|
||||
}
|
||||
|
||||
return {
|
||||
"primary_column": df._geometry_column_name,
|
||||
"columns": column_metadata,
|
||||
"version": schema_version,
|
||||
"creator": {"library": "geopandas", "version": geopandas.__version__},
|
||||
}
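# Illustrative sketch of the resulting metadata for a single WKB-encoded
# geometry column (``gdf`` is a hypothetical GeoDataFrame with one geometry
# column named "geometry"):
# >>> meta = _create_metadata(gdf, geometry_encoding={"geometry": "WKB"})
# >>> meta["columns"]["geometry"]["encoding"]
# 'WKB'
# >>> meta["version"]
# '1.0.0'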
|
||||
|
||||
|
||||
def _encode_metadata(metadata):
|
||||
"""Encode metadata dict to UTF-8 JSON string
|
||||
|
||||
Parameters
|
||||
----------
|
||||
metadata : dict
|
||||
|
||||
Returns
|
||||
-------
|
||||
UTF-8 encoded JSON string
|
||||
"""
|
||||
return json.dumps(metadata).encode("utf-8")
|
||||
|
||||
|
||||
def _decode_metadata(metadata_str):
|
||||
"""Decode a UTF-8 encoded JSON string to dict
|
||||
|
||||
Parameters
|
||||
----------
|
||||
metadata_str : string (UTF-8 encoded)
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
"""
|
||||
if metadata_str is None:
|
||||
return None
|
||||
|
||||
return json.loads(metadata_str.decode("utf-8"))
|
||||
|
||||
|
||||
def _validate_dataframe(df):
|
||||
"""Validate that the GeoDataFrame conforms to requirements for writing
|
||||
to Parquet format.
|
||||
|
||||
Raises `ValueError` if the GeoDataFrame is not valid.
|
||||
|
||||
copied from `pandas.io.parquet`
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : GeoDataFrame
|
||||
"""
|
||||
|
||||
if not isinstance(df, DataFrame):
|
||||
raise ValueError("Writing to Parquet/Feather only supports IO with DataFrames")
|
||||
|
||||
# must have value column names (strings only)
|
||||
if df.columns.inferred_type not in {"string", "unicode", "empty"}:
|
||||
raise ValueError("Writing to Parquet/Feather requires string column names")
|
||||
|
||||
# index level names must be strings
|
||||
valid_names = all(
|
||||
isinstance(name, str) for name in df.index.names if name is not None
|
||||
)
|
||||
if not valid_names:
|
||||
raise ValueError("Index level names must be strings")
|
||||
|
||||
|
||||
def _validate_geo_metadata(metadata):
|
||||
"""Validate geo metadata.
|
||||
Must not be empty, and must contain the structure specified above.
|
||||
|
||||
Raises ValueError if metadata is not valid.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
metadata : dict
|
||||
"""
|
||||
|
||||
if not metadata:
|
||||
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
|
||||
|
||||
# version was schema_version in 0.1.0
|
||||
version = metadata.get("version", metadata.get("schema_version"))
|
||||
if not version:
|
||||
raise ValueError(
|
||||
"'geo' metadata in Parquet/Feather file is missing required key: "
|
||||
"'version'"
|
||||
)
|
||||
|
||||
required_keys = ("primary_column", "columns")
|
||||
for key in required_keys:
|
||||
if metadata.get(key, None) is None:
|
||||
raise ValueError(
|
||||
"'geo' metadata in Parquet/Feather file is missing required key: "
|
||||
"'{key}'".format(key=key)
|
||||
)
|
||||
|
||||
if not isinstance(metadata["columns"], dict):
|
||||
raise ValueError("'columns' in 'geo' metadata must be a dict")
|
||||
|
||||
# Validate that geometry columns have required metadata and values
|
||||
# leaving out "geometry_type" for compatibility with 0.1
|
||||
required_col_keys = ("encoding",)
|
||||
for col, column_metadata in metadata["columns"].items():
|
||||
for key in required_col_keys:
|
||||
if key not in column_metadata:
|
||||
raise ValueError(
|
||||
"'geo' metadata in Parquet/Feather file is missing required key "
|
||||
"'{key}' for column '{col}'".format(key=key, col=col)
|
||||
)
|
||||
|
||||
if column_metadata["encoding"] not in SUPPORTED_ENCODINGS:
|
||||
raise ValueError(
|
||||
"Only WKB geometry encoding or one of the native encodings "
|
||||
f"({GEOARROW_ENCODINGS!r}) are supported, "
|
||||
f"got: {column_metadata['encoding']}"
|
||||
)
|
||||
|
||||
if column_metadata.get("edges", "planar") == "spherical":
|
||||
warnings.warn(
|
||||
f"The geo metadata indicate that column '{col}' has spherical edges, "
|
||||
"but because GeoPandas currently does not support spherical "
|
||||
"geometry, it ignores this metadata and will interpret the edges of "
|
||||
"the geometries as planar.",
|
||||
UserWarning,
|
||||
stacklevel=4,
|
||||
)
|
||||
|
||||
if "covering" in column_metadata:
|
||||
covering = column_metadata["covering"]
|
||||
if "bbox" in covering:
|
||||
bbox = covering["bbox"]
|
||||
for var in ["xmin", "ymin", "xmax", "ymax"]:
|
||||
if var not in bbox.keys():
|
||||
raise ValueError("Metadata for bbox column is malformed.")
|
||||
|
||||
|
||||
def _geopandas_to_arrow(
|
||||
df,
|
||||
index=None,
|
||||
geometry_encoding="WKB",
|
||||
schema_version=None,
|
||||
write_covering_bbox=None,
|
||||
):
|
||||
"""
|
||||
Helper function with main, shared logic for to_parquet/to_feather.
|
||||
"""
|
||||
from pyarrow import StructArray
|
||||
|
||||
from geopandas.io._geoarrow import geopandas_to_arrow
|
||||
|
||||
_validate_dataframe(df)
|
||||
|
||||
if schema_version is not None:
|
||||
if geometry_encoding != "WKB" and schema_version != "1.1.0":
|
||||
raise ValueError(
|
||||
"'geoarrow' encoding is only supported with schema version >= 1.1.0"
|
||||
)
|
||||
|
||||
table, geometry_encoding_dict = geopandas_to_arrow(
|
||||
df, geometry_encoding=geometry_encoding, index=index, interleaved=False
|
||||
)
|
||||
geo_metadata = _create_metadata(
|
||||
df,
|
||||
schema_version=schema_version,
|
||||
geometry_encoding=geometry_encoding_dict,
|
||||
write_covering_bbox=write_covering_bbox,
|
||||
)
|
||||
|
||||
if write_covering_bbox:
|
||||
if "bbox" in df.columns:
|
||||
raise ValueError(
|
||||
"An existing column 'bbox' already exists in the dataframe. "
|
||||
"Please rename to write covering bbox."
|
||||
)
|
||||
bounds = df.bounds
|
||||
bbox_array = StructArray.from_arrays(
|
||||
[bounds["minx"], bounds["miny"], bounds["maxx"], bounds["maxy"]],
|
||||
names=["xmin", "ymin", "xmax", "ymax"],
|
||||
)
|
||||
table = table.append_column("bbox", bbox_array)
|
||||
|
||||
# Store geopandas specific file-level metadata
|
||||
# This must be done AFTER creating the table or it is not persisted
|
||||
metadata = table.schema.metadata
|
||||
metadata.update({b"geo": _encode_metadata(geo_metadata)})
|
||||
|
||||
return table.replace_schema_metadata(metadata)
|
||||
|
||||
|
||||
def _to_parquet(
|
||||
df,
|
||||
path,
|
||||
index=None,
|
||||
compression="snappy",
|
||||
geometry_encoding="WKB",
|
||||
schema_version=None,
|
||||
write_covering_bbox=False,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Write a GeoDataFrame to the Parquet format.
|
||||
|
||||
Any geometry columns present are serialized to WKB format in the file.
|
||||
|
||||
Requires 'pyarrow'.
|
||||
|
||||
This is tracking version 1.0.0 of the GeoParquet specification at:
|
||||
https://github.com/opengeospatial/geoparquet. Writing older versions is
|
||||
supported using the `schema_version` keyword.
|
||||
|
||||
.. versionadded:: 0.8
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, path object
|
||||
index : bool, default None
|
||||
If ``True``, always include the dataframe's index(es) as columns
|
||||
in the file output.
|
||||
If ``False``, the index(es) will not be written to the file.
|
||||
If ``None``, the index(es) will be included as columns in the file
|
||||
output except `RangeIndex` which is stored as metadata only.
|
||||
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
|
||||
Name of the compression to use. Use ``None`` for no compression.
|
||||
geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
|
||||
The encoding to use for the geometry columns. Defaults to "WKB"
|
||||
for maximum interoperability. Specify "geoarrow" to use one of the
|
||||
native GeoArrow-based single-geometry type encodings.
|
||||
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
|
||||
GeoParquet specification version; if not provided will default to
|
||||
latest supported version.
|
||||
write_covering_bbox : bool, default False
|
||||
Writes the bounding box column for each row entry with column
|
||||
name 'bbox'. Writing a bbox column can be computationally
|
||||
expensive, hence its default setting is False.
|
||||
**kwargs
|
||||
Additional keyword arguments passed to pyarrow.parquet.write_table().
|
||||
"""
|
||||
parquet = import_optional_dependency(
|
||||
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
|
||||
)
|
||||
|
||||
path = _expand_user(path)
|
||||
table = _geopandas_to_arrow(
|
||||
df,
|
||||
index=index,
|
||||
geometry_encoding=geometry_encoding,
|
||||
schema_version=schema_version,
|
||||
write_covering_bbox=write_covering_bbox,
|
||||
)
|
||||
parquet.write_table(table, path, compression=compression, **kwargs)
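# Typical use goes through the public GeoDataFrame.to_parquet wrapper
# (illustrative; the file name and ``gdf`` are hypothetical):
# >>> gdf.to_parquet("data.parquet", geometry_encoding="geoarrow")  # doctest: +SKIP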
|
||||
|
||||
|
||||
def _to_feather(df, path, index=None, compression=None, schema_version=None, **kwargs):
|
||||
"""
|
||||
Write a GeoDataFrame to the Feather format.
|
||||
|
||||
Any geometry columns present are serialized to WKB format in the file.
|
||||
|
||||
Requires 'pyarrow' >= 0.17.
|
||||
|
||||
This is tracking version 1.0.0 of the GeoParquet specification for
|
||||
the metadata at: https://github.com/opengeospatial/geoparquet. Writing
|
||||
older versions is supported using the `schema_version` keyword.
|
||||
|
||||
.. versionadded:: 0.8
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, path object
|
||||
index : bool, default None
|
||||
If ``True``, always include the dataframe's index(es) as columns
|
||||
in the file output.
|
||||
If ``False``, the index(es) will not be written to the file.
|
||||
If ``None``, the index(es) will be included as columns in the file
|
||||
output except `RangeIndex` which is stored as metadata only.
|
||||
compression : {'zstd', 'lz4', 'uncompressed'}, optional
|
||||
Name of the compression to use. Use ``"uncompressed"`` for no
|
||||
compression. By default uses LZ4 if available, otherwise uncompressed.
|
||||
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
|
||||
GeoParquet specification version for the metadata; if not provided
|
||||
will default to latest supported version.
|
||||
kwargs
|
||||
Additional keyword arguments passed to pyarrow.feather.write_feather().
|
||||
"""
|
||||
feather = import_optional_dependency(
|
||||
"pyarrow.feather", extra="pyarrow is required for Feather support."
|
||||
)
|
||||
# TODO move this into `import_optional_dependency`
|
||||
import pyarrow
|
||||
|
||||
if Version(pyarrow.__version__) < Version("0.17.0"):
|
||||
raise ImportError("pyarrow >= 0.17 required for Feather support")
|
||||
|
||||
path = _expand_user(path)
|
||||
table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
|
||||
feather.write_feather(table, path, compression=compression, **kwargs)
|
||||
|
||||
|
||||
def _arrow_to_geopandas(table, geo_metadata=None):
|
||||
"""
|
||||
Helper function with main, shared logic for read_parquet/read_feather.
|
||||
"""
|
||||
if geo_metadata is None:
|
||||
# Note: this path of not passing metadata is also used by dask-geopandas
|
||||
geo_metadata = _validate_and_decode_metadata(table.schema.metadata)
|
||||
|
||||
# Find all geometry columns that were read from the file. May
|
||||
# be a subset if 'columns' parameter is used.
|
||||
geometry_columns = [
|
||||
col for col in geo_metadata["columns"] if col in table.column_names
|
||||
]
|
||||
result_column_names = list(table.slice(0, 0).to_pandas().columns)
|
||||
geometry_columns.sort(key=result_column_names.index)
|
||||
|
||||
if not len(geometry_columns):
|
||||
raise ValueError(
|
||||
"""No geometry columns are included in the columns read from
|
||||
the Parquet/Feather file. To read this file without geometry columns,
|
||||
use pandas.read_parquet/read_feather() instead."""
|
||||
)
|
||||
|
||||
geometry = geo_metadata["primary_column"]
|
||||
|
||||
# Missing geometry likely indicates a subset of columns was read;
|
||||
# promote the first available geometry to the primary geometry.
|
||||
if len(geometry_columns) and geometry not in geometry_columns:
|
||||
geometry = geometry_columns[0]
|
||||
|
||||
# if there are multiple non-primary geometry columns, raise a warning
|
||||
if len(geometry_columns) > 1:
|
||||
warnings.warn(
|
||||
"Multiple non-primary geometry columns read from Parquet/Feather "
|
||||
"file. The first column read was promoted to the primary geometry.",
|
||||
stacklevel=3,
|
||||
)
|
||||
|
||||
table_attr = table.drop(geometry_columns)
|
||||
df = table_attr.to_pandas()
|
||||
|
||||
# Convert the WKB columns that are present back to geometry.
|
||||
for col in geometry_columns:
|
||||
col_metadata = geo_metadata["columns"][col]
|
||||
if "crs" in col_metadata:
|
||||
crs = col_metadata["crs"]
|
||||
if isinstance(crs, dict):
|
||||
_remove_id_from_member_of_ensembles(crs)
|
||||
else:
|
||||
# per the GeoParquet spec, missing CRS is to be interpreted as
|
||||
# OGC:CRS84
|
||||
crs = "OGC:CRS84"
|
||||
|
||||
if col_metadata["encoding"] == "WKB":
|
||||
geom_arr = from_wkb(np.array(table[col]), crs=crs)
|
||||
else:
|
||||
from geopandas.io._geoarrow import construct_shapely_array
|
||||
|
||||
geom_arr = from_shapely(
|
||||
construct_shapely_array(
|
||||
table[col].combine_chunks(), "geoarrow." + col_metadata["encoding"]
|
||||
),
|
||||
crs=crs,
|
||||
)
|
||||
|
||||
df.insert(result_column_names.index(col), col, geom_arr)
|
||||
|
||||
return GeoDataFrame(df, geometry=geometry)
|
||||
|
||||
|
||||
def _get_filesystem_path(path, filesystem=None, storage_options=None):
|
||||
"""
|
||||
Get the filesystem and path for a given filesystem and path.
|
||||
|
||||
If the filesystem is not None then it's just returned as is.
|
||||
"""
|
||||
import pyarrow
|
||||
|
||||
if (
|
||||
isinstance(path, str)
|
||||
and storage_options is None
|
||||
and filesystem is None
|
||||
and Version(pyarrow.__version__) >= Version("5.0.0")
|
||||
):
|
||||
# Use the native pyarrow filesystem if possible.
|
||||
try:
|
||||
from pyarrow.fs import FileSystem
|
||||
|
||||
filesystem, path = FileSystem.from_uri(path)
|
||||
except Exception:
|
||||
# fallback to use get_handle / fsspec for filesystems
|
||||
# that pyarrow doesn't support
|
||||
pass
|
||||
|
||||
if _is_fsspec_url(path) and filesystem is None:
|
||||
fsspec = import_optional_dependency(
|
||||
"fsspec", extra="fsspec is requred for 'storage_options'."
|
||||
)
|
||||
filesystem, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
|
||||
|
||||
if filesystem is None and storage_options:
|
||||
raise ValueError(
|
||||
"Cannot provide 'storage_options' with non-fsspec path '{}'".format(path)
|
||||
)
|
||||
|
||||
return filesystem, path
|
||||
|
||||
|
||||
def _ensure_arrow_fs(filesystem):
|
||||
"""
|
||||
Simplified version of pyarrow.fs._ensure_filesystem. This is only needed
|
||||
below because `pyarrow.parquet.read_metadata` does not yet accept a
|
||||
filesystem keyword (https://issues.apache.org/jira/browse/ARROW-16719)
|
||||
"""
|
||||
from pyarrow import fs
|
||||
|
||||
if isinstance(filesystem, fs.FileSystem):
|
||||
return filesystem
|
||||
|
||||
# handle fsspec-compatible filesystems
|
||||
try:
|
||||
import fsspec
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
if isinstance(filesystem, fsspec.AbstractFileSystem):
|
||||
return fs.PyFileSystem(fs.FSSpecHandler(filesystem))
|
||||
|
||||
return filesystem
|
||||
|
||||
|
||||
def _validate_and_decode_metadata(metadata):
|
||||
if metadata is None or b"geo" not in metadata:
|
||||
raise ValueError(
|
||||
"""Missing geo metadata in Parquet/Feather file.
|
||||
Use pandas.read_parquet/read_feather() instead."""
|
||||
)
|
||||
|
||||
# check for malformed metadata
|
||||
try:
|
||||
decoded_geo_metadata = _decode_metadata(metadata.get(b"geo", b""))
|
||||
except (TypeError, json.decoder.JSONDecodeError):
|
||||
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
|
||||
|
||||
_validate_geo_metadata(decoded_geo_metadata)
|
||||
return decoded_geo_metadata
|
||||
|
||||
|
||||
def _read_parquet_schema_and_metadata(path, filesystem):
|
||||
"""
|
||||
Opening the Parquet file/dataset a first time to get the schema and metadata.
|
||||
|
||||
TODO: we should look into how we can reuse opened dataset for reading the
|
||||
actual data, to avoid discovering the dataset twice (problem right now is
|
||||
that the ParquetDataset interface doesn't allow passing the filters on read)
|
||||
|
||||
"""
|
||||
import pyarrow
|
||||
from pyarrow import parquet
|
||||
|
||||
kwargs = {}
|
||||
if Version(pyarrow.__version__) < Version("15.0.0"):
|
||||
kwargs = dict(use_legacy_dataset=False)
|
||||
|
||||
try:
|
||||
schema = parquet.ParquetDataset(path, filesystem=filesystem, **kwargs).schema
|
||||
except Exception:
|
||||
schema = parquet.read_schema(path, filesystem=filesystem)
|
||||
|
||||
metadata = schema.metadata
|
||||
|
||||
# read metadata separately to get the raw Parquet FileMetaData metadata
|
||||
# (pyarrow doesn't properly expose those in schema.metadata for files
|
||||
# created by GDAL - https://issues.apache.org/jira/browse/ARROW-16688)
|
||||
if metadata is None or b"geo" not in metadata:
|
||||
try:
|
||||
metadata = parquet.read_metadata(path, filesystem=filesystem).metadata
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return schema, metadata
|
||||
|
||||
|
||||
def _read_parquet(path, columns=None, storage_options=None, bbox=None, **kwargs):
|
||||
"""
|
||||
Load a Parquet object from the file path, returning a GeoDataFrame.
|
||||
|
||||
You can read a subset of columns in the file using the ``columns`` parameter.
|
||||
However, the structure of the returned GeoDataFrame will depend on which
|
||||
columns you read:
|
||||
|
||||
* if no geometry columns are read, this will raise a ``ValueError`` - you
|
||||
should use the pandas `read_parquet` method instead.
|
||||
* if the primary geometry column saved to this file is not included in
|
||||
columns, the first available geometry column will be set as the geometry
|
||||
column of the returned GeoDataFrame.
|
||||
|
||||
Supports versions 0.1.0, 0.4.0, 1.0.0 and 1.1.0 of the GeoParquet
|
||||
specification at: https://github.com/opengeospatial/geoparquet
|
||||
|
||||
If 'crs' key is not present in the GeoParquet metadata associated with the
|
||||
Parquet object, it will default to "OGC:CRS84" according to the specification.
|
||||
|
||||
Requires 'pyarrow'.
|
||||
|
||||
.. versionadded:: 0.8
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, path object
|
||||
columns : list-like of strings, default=None
|
||||
If not None, only these columns will be read from the file. If
|
||||
the primary geometry column is not included, the first secondary
|
||||
geometry read from the file will be set as the geometry column
|
||||
of the returned GeoDataFrame. If no geometry columns are present,
|
||||
a ``ValueError`` will be raised.
|
||||
storage_options : dict, optional
|
||||
Extra options that make sense for a particular storage connection, e.g. host,
|
||||
port, username, password, etc. For HTTP(S) URLs the key-value pairs are
|
||||
forwarded to urllib as header options. For other URLs (e.g. starting with
|
||||
"s3://", and "gcs://") the key-value pairs are forwarded to fsspec. Please
|
||||
see fsspec and urllib for more details.
|
||||
|
||||
When no storage options are provided and a filesystem is implemented by
|
||||
both ``pyarrow.fs`` and ``fsspec`` (e.g. "s3://") then the ``pyarrow.fs``
|
||||
filesystem is preferred. Provide the instantiated fsspec filesystem using
|
||||
the ``filesystem`` keyword if you wish to use its implementation.
|
||||
bbox : tuple, optional
|
||||
Bounding box to be used to filter selection from geoparquet data. This
|
||||
is only usable if the data was saved with the bbox covering metadata.
|
||||
Input is of the tuple format (xmin, ymin, xmax, ymax).
|
||||
|
||||
**kwargs
|
||||
Any additional kwargs passed to :func:`pyarrow.parquet.read_table`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
GeoDataFrame
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = geopandas.read_parquet("data.parquet") # doctest: +SKIP
|
||||
|
||||
Specifying columns to read:
|
||||
|
||||
>>> df = geopandas.read_parquet(
|
||||
... "data.parquet",
|
||||
... columns=["geometry", "pop_est"]
|
||||
... ) # doctest: +SKIP
|
||||
"""
|
||||
|
||||
parquet = import_optional_dependency(
|
||||
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
|
||||
)
|
||||
import geopandas.io._pyarrow_hotfix # noqa: F401
|
||||
|
||||
# TODO(https://github.com/pandas-dev/pandas/pull/41194): see if pandas
|
||||
# adds filesystem as a keyword and match that.
|
||||
filesystem = kwargs.pop("filesystem", None)
|
||||
filesystem, path = _get_filesystem_path(
|
||||
path, filesystem=filesystem, storage_options=storage_options
|
||||
)
|
||||
path = _expand_user(path)
|
||||
schema, metadata = _read_parquet_schema_and_metadata(path, filesystem)
|
||||
|
||||
geo_metadata = _validate_and_decode_metadata(metadata)
|
||||
|
||||
bbox_filter = (
|
||||
_get_parquet_bbox_filter(geo_metadata, bbox) if bbox is not None else None
|
||||
)
|
||||
|
||||
if_bbox_column_exists = _check_if_covering_in_geo_metadata(geo_metadata)
|
||||
|
||||
# by default, bbox column is not read in, so must specify which
|
||||
# columns are read in if it exists.
|
||||
if not columns and if_bbox_column_exists:
|
||||
columns = _get_non_bbox_columns(schema, geo_metadata)
|
||||
|
||||
# if both bbox and filters kwargs are used, must splice together.
|
||||
if "filters" in kwargs:
|
||||
filters_kwarg = kwargs.pop("filters")
|
||||
filters = _splice_bbox_and_filters(filters_kwarg, bbox_filter)
|
||||
else:
|
||||
filters = bbox_filter
|
||||
|
||||
kwargs["use_pandas_metadata"] = True
|
||||
|
||||
table = parquet.read_table(
|
||||
path, columns=columns, filesystem=filesystem, filters=filters, **kwargs
|
||||
)
|
||||
|
||||
return _arrow_to_geopandas(table, geo_metadata)
|
||||
|
||||
|
||||
def _read_feather(path, columns=None, **kwargs):
|
||||
"""
|
||||
Load a Feather object from the file path, returning a GeoDataFrame.
|
||||
|
||||
You can read a subset of columns in the file using the ``columns`` parameter.
|
||||
However, the structure of the returned GeoDataFrame will depend on which
|
||||
columns you read:
|
||||
|
||||
* if no geometry columns are read, this will raise a ``ValueError`` - you
|
||||
should use the pandas `read_feather` method instead.
|
||||
* if the primary geometry column saved to this file is not included in
|
||||
columns, the first available geometry column will be set as the geometry
|
||||
column of the returned GeoDataFrame.
|
||||
|
||||
Supports versions 0.1.0, 0.4.0, 1.0.0 and 1.1.0 of the GeoParquet
|
||||
specification at: https://github.com/opengeospatial/geoparquet
|
||||
|
||||
If 'crs' key is not present in the Feather metadata associated with the
|
||||
Parquet object, it will default to "OGC:CRS84" according to the specification.
|
||||
|
||||
Requires 'pyarrow' >= 0.17.
|
||||
|
||||
.. versionadded:: 0.8
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str, path object
|
||||
columns : list-like of strings, default=None
|
||||
If not None, only these columns will be read from the file. If
|
||||
the primary geometry column is not included, the first secondary
|
||||
geometry read from the file will be set as the geometry column
|
||||
of the returned GeoDataFrame. If no geometry columns are present,
|
||||
a ``ValueError`` will be raised.
|
||||
**kwargs
|
||||
Any additional kwargs passed to pyarrow.feather.read_table().
|
||||
|
||||
Returns
|
||||
-------
|
||||
GeoDataFrame
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = geopandas.read_feather("data.feather") # doctest: +SKIP
|
||||
|
||||
Specifying columns to read:
|
||||
|
||||
>>> df = geopandas.read_feather(
|
||||
... "data.feather",
|
||||
... columns=["geometry", "pop_est"]
|
||||
... ) # doctest: +SKIP
|
||||
"""
|
||||
|
||||
feather = import_optional_dependency(
|
||||
"pyarrow.feather", extra="pyarrow is required for Feather support."
|
||||
)
|
||||
# TODO move this into `import_optional_dependency`
|
||||
import pyarrow
|
||||
|
||||
import geopandas.io._pyarrow_hotfix # noqa: F401
|
||||
|
||||
if Version(pyarrow.__version__) < Version("0.17.0"):
|
||||
raise ImportError("pyarrow >= 0.17 required for Feather support")
|
||||
|
||||
path = _expand_user(path)
|
||||
|
||||
table = feather.read_table(path, columns=columns, **kwargs)
|
||||
return _arrow_to_geopandas(table)
|
||||
|
||||
|
||||
def _get_parquet_bbox_filter(geo_metadata, bbox):
|
||||
primary_column = geo_metadata["primary_column"]
|
||||
|
||||
if _check_if_covering_in_geo_metadata(geo_metadata):
|
||||
bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
|
||||
return _convert_bbox_to_parquet_filter(bbox, bbox_column_name)
|
||||
|
||||
elif geo_metadata["columns"][primary_column]["encoding"] == "point":
|
||||
import pyarrow.compute as pc
|
||||
|
||||
return (
|
||||
(pc.field((primary_column, "x")) >= bbox[0])
|
||||
& (pc.field((primary_column, "x")) <= bbox[2])
|
||||
& (pc.field((primary_column, "y")) >= bbox[1])
|
||||
& (pc.field((primary_column, "y")) <= bbox[3])
|
||||
)
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
"Specifying 'bbox' not supported for this Parquet file (it should either "
|
||||
"have a bbox covering column or use 'point' encoding)."
|
||||
)
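# Illustrative bbox-filtered read via the public API (file name hypothetical;
# works when the file has a bbox covering column or uses 'point' encoding):
# >>> geopandas.read_parquet(
# ...     "data.parquet", bbox=(0, 0, 10, 10)
# ... )  # doctest: +SKIP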
|
||||
|
||||
|
||||
def _convert_bbox_to_parquet_filter(bbox, bbox_column_name):
|
||||
import pyarrow.compute as pc
|
||||
|
||||
return ~(
|
||||
(pc.field((bbox_column_name, "xmin")) > bbox[2])
|
||||
| (pc.field((bbox_column_name, "ymin")) > bbox[3])
|
||||
| (pc.field((bbox_column_name, "xmax")) < bbox[0])
|
||||
| (pc.field((bbox_column_name, "ymax")) < bbox[1])
|
||||
)
|
||||
|
||||
|
||||
def _check_if_covering_in_geo_metadata(geo_metadata):
|
||||
primary_column = geo_metadata["primary_column"]
|
||||
return "covering" in geo_metadata["columns"][primary_column].keys()
|
||||
|
||||
|
||||
def _get_bbox_encoding_column_name(geo_metadata):
|
||||
primary_column = geo_metadata["primary_column"]
|
||||
return geo_metadata["columns"][primary_column]["covering"]["bbox"]["xmin"][0]
|
||||
|
||||
|
||||
def _get_non_bbox_columns(schema, geo_metadata):
|
||||
|
||||
bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
|
||||
columns = schema.names
|
||||
if bbox_column_name in columns:
|
||||
columns.remove(bbox_column_name)
|
||||
return columns
|
||||
|
||||
|
||||
def _splice_bbox_and_filters(kwarg_filters, bbox_filter):
|
||||
parquet = import_optional_dependency(
|
||||
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
|
||||
)
|
||||
if bbox_filter is None:
|
||||
return kwarg_filters
|
||||
|
||||
filters_expression = parquet.filters_to_expression(kwarg_filters)
|
||||
return bbox_filter & filters_expression
|
||||
@@ -0,0 +1,851 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import urllib.request
|
||||
import warnings
|
||||
from io import IOBase
|
||||
from packaging.version import Version
|
||||
from pathlib import Path
|
||||
|
||||
# Adapted from pandas.io.common
|
||||
from urllib.parse import urlparse as parse_url
|
||||
from urllib.parse import uses_netloc, uses_params, uses_relative
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pandas.api.types import is_integer_dtype
|
||||
|
||||
import shapely
|
||||
from shapely.geometry import mapping
|
||||
from shapely.geometry.base import BaseGeometry
|
||||
|
||||
from geopandas import GeoDataFrame, GeoSeries
|
||||
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20
|
||||
from geopandas.io.util import vsi_path
|
||||
|
||||
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
|
||||
_VALID_URLS.discard("")
|
||||
# file:// URIs are supported by fiona/pyogrio -> don't already open + read the file here
|
||||
_VALID_URLS.discard("file")
|
||||
|
||||
fiona = None
|
||||
fiona_env = None
|
||||
fiona_import_error = None
|
||||
FIONA_GE_19 = False
|
||||
|
||||
|
||||
def _import_fiona():
|
||||
global fiona
|
||||
global fiona_env
|
||||
global fiona_import_error
|
||||
global FIONA_GE_19
|
||||
|
||||
if fiona is None:
|
||||
try:
|
||||
import fiona
|
||||
|
||||
# only try to import fiona.Env if the main fiona import succeeded
|
||||
# (otherwise you can get confusing "AttributeError: module 'fiona'
|
||||
# has no attribute '_loading'" / partially initialized module errors)
|
||||
try:
|
||||
from fiona import Env as fiona_env
|
||||
except ImportError:
|
||||
try:
|
||||
from fiona import drivers as fiona_env
|
||||
except ImportError:
|
||||
fiona_env = None
|
||||
|
||||
FIONA_GE_19 = Version(Version(fiona.__version__).base_version) >= Version(
|
||||
"1.9.0"
|
||||
)
|
||||
|
||||
except ImportError as err:
|
||||
fiona = False
|
||||
fiona_import_error = str(err)
|
||||
|
||||
|
||||
pyogrio = None
|
||||
pyogrio_import_error = None
|
||||
|
||||
|
||||
def _import_pyogrio():
|
||||
global pyogrio
|
||||
global pyogrio_import_error
|
||||
|
||||
if pyogrio is None:
|
||||
try:
|
||||
import pyogrio
|
||||
|
||||
except ImportError as err:
|
||||
pyogrio = False
|
||||
pyogrio_import_error = str(err)
|
||||
|
||||
|
||||
def _check_fiona(func):
|
||||
if not fiona:
|
||||
raise ImportError(
|
||||
f"the {func} requires the 'fiona' package, but it is not installed or does "
|
||||
f"not import correctly.\nImporting fiona resulted in: {fiona_import_error}"
|
||||
)
|
||||
|
||||
|
||||
def _check_pyogrio(func):
|
||||
if not pyogrio:
|
||||
raise ImportError(
|
||||
f"the {func} requires the 'pyogrio' package, but it is not installed "
|
||||
"or does not import correctly."
|
||||
"\nImporting pyogrio resulted in: {pyogrio_import_error}"
|
||||
)
|
||||
|
||||
|
||||
def _check_metadata_supported(metadata: str | None, engine: str, driver: str) -> None:
|
||||
if metadata is None:
|
||||
return
|
||||
if driver != "GPKG":
|
||||
raise NotImplementedError(
|
||||
"The 'metadata' keyword is only supported for the GPKG driver."
|
||||
)
|
||||
|
||||
if engine == "fiona" and not FIONA_GE_19:
|
||||
raise NotImplementedError(
|
||||
"The 'metadata' keyword is only supported for Fiona >= 1.9."
|
||||
)
|
||||
|
||||
|
||||
def _check_engine(engine, func):
|
||||
# if not specified through keyword or option, then default to "pyogrio" if
|
||||
# installed, otherwise try fiona
|
||||
if engine is None:
|
||||
import geopandas
|
||||
|
||||
engine = geopandas.options.io_engine
|
||||
|
||||
if engine is None:
|
||||
_import_pyogrio()
|
||||
if pyogrio:
|
||||
engine = "pyogrio"
|
||||
else:
|
||||
_import_fiona()
|
||||
if fiona:
|
||||
engine = "fiona"
|
||||
|
||||
if engine == "pyogrio":
|
||||
_import_pyogrio()
|
||||
_check_pyogrio(func)
|
||||
elif engine == "fiona":
|
||||
_import_fiona()
|
||||
_check_fiona(func)
|
||||
elif engine is None:
|
||||
raise ImportError(
|
||||
f"The {func} requires the 'pyogrio' or 'fiona' package, "
|
||||
"but neither is installed or imports correctly."
|
||||
f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
|
||||
f"\nImporting fiona resulted in: {fiona_import_error}"
|
||||
)
|
||||
|
||||
return engine
|
||||
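# Illustrative sketch of how the resolution above plays out from user code; the
# file name is hypothetical. Setting ``geopandas.options.io_engine`` pins the
# engine globally, while the ``engine=`` keyword overrides it per call.
def _example_engine_selection():
    import geopandas

    geopandas.options.io_engine = "pyogrio"  # resolved by _check_engine()
    df_default = geopandas.read_file("nybb.shp")                # uses pyogrio
    df_fiona = geopandas.read_file("nybb.shp", engine="fiona")  # per-call override
    return df_default, df_fiona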
|
||||
|
||||
_EXTENSION_TO_DRIVER = {
|
||||
".bna": "BNA",
|
||||
".dxf": "DXF",
|
||||
".csv": "CSV",
|
||||
".shp": "ESRI Shapefile",
|
||||
".dbf": "ESRI Shapefile",
|
||||
".json": "GeoJSON",
|
||||
".geojson": "GeoJSON",
|
||||
".geojsonl": "GeoJSONSeq",
|
||||
".geojsons": "GeoJSONSeq",
|
||||
".gpkg": "GPKG",
|
||||
".gml": "GML",
|
||||
".xml": "GML",
|
||||
".gpx": "GPX",
|
||||
".gtm": "GPSTrackMaker",
|
||||
".gtz": "GPSTrackMaker",
|
||||
".tab": "MapInfo File",
|
||||
".mif": "MapInfo File",
|
||||
".mid": "MapInfo File",
|
||||
".dgn": "DGN",
|
||||
".fgb": "FlatGeobuf",
|
||||
}
|
||||
|
||||
|
||||
def _expand_user(path):
|
||||
"""Expand paths that use ~."""
|
||||
if isinstance(path, str):
|
||||
path = os.path.expanduser(path)
|
||||
elif isinstance(path, Path):
|
||||
path = path.expanduser()
|
||||
return path
|
||||
|
||||
|
||||
def _is_url(url):
|
||||
"""Check to see if *url* has a valid protocol."""
|
||||
try:
|
||||
return parse_url(url).scheme in _VALID_URLS
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _read_file(
|
||||
filename, bbox=None, mask=None, columns=None, rows=None, engine=None, **kwargs
|
||||
):
|
||||
"""
|
||||
Returns a GeoDataFrame from a file or URL.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename : str, path object or file-like object
|
||||
Either the absolute or relative path to the file or URL to
|
||||
be opened, or any object with a read() method (such as an open file
|
||||
or StringIO)
|
||||
bbox : tuple | GeoDataFrame or GeoSeries | shapely Geometry, default None
|
||||
Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely
|
||||
geometry. With engine="fiona", CRS mis-matches are resolved if given a GeoSeries
|
||||
or GeoDataFrame. With engine="pyogrio", bbox must be in the same CRS as the
|
||||
dataset. Tuple is (minx, miny, maxx, maxy) to match the bounds property of
|
||||
shapely geometry objects. Cannot be used with mask.
|
||||
mask : dict | GeoDataFrame or GeoSeries | shapely Geometry, default None
|
||||
Filter for features that intersect with the given dict-like geojson
|
||||
geometry, GeoSeries, GeoDataFrame or shapely geometry.
|
||||
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
|
||||
Cannot be used with bbox. If multiple geometries are passed, this will
|
||||
first union all geometries, which may be computationally expensive.
|
||||
columns : list, optional
|
||||
List of column names to import from the data source. Column names
|
||||
must exactly match the names in the data source. To avoid reading
|
||||
any columns (besides the geometry column), pass an empty list-like.
|
||||
By default reads all columns.
|
||||
rows : int or slice, default None
|
||||
Load in specific rows by passing an integer (first `n` rows) or a
|
||||
slice() object.
|
||||
engine : str, "pyogrio" or "fiona"
|
||||
The underlying library that is used to read the file. Currently, the
|
||||
supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
|
||||
installed, otherwise tries "fiona". Engine can also be set globally
|
||||
with the ``geopandas.options.io_engine`` option.
|
||||
    **kwargs :
        Keyword args to be passed to the engine, and can be used to read
        multi-layer data, data stored within archives (zip files), etc.
        In case of the "pyogrio" engine, the keyword arguments are passed to
        `pyogrio.read_dataframe`. In case of the "fiona" engine, the keyword
        arguments are passed to `fiona.open`. For more information on possible
        keywords, type: ``import pyogrio; help(pyogrio.read_dataframe)``.
|
||||
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> df = geopandas.read_file("nybb.shp") # doctest: +SKIP
|
||||
|
||||
Specifying layer of GPKG:
|
||||
|
||||
>>> df = geopandas.read_file("file.gpkg", layer='cities') # doctest: +SKIP
|
||||
|
||||
Reading only first 10 rows:
|
||||
|
||||
>>> df = geopandas.read_file("nybb.shp", rows=10) # doctest: +SKIP
|
||||
|
||||
Reading only geometries intersecting ``mask``:
|
||||
|
||||
>>> df = geopandas.read_file("nybb.shp", mask=polygon) # doctest: +SKIP
|
||||
|
||||
Reading only geometries intersecting ``bbox``:
|
||||
|
||||
>>> df = geopandas.read_file("nybb.shp", bbox=(0, 0, 10, 20)) # doctest: +SKIP
|
||||
|
||||
Returns
|
||||
-------
|
||||
:obj:`geopandas.GeoDataFrame` or :obj:`pandas.DataFrame` :
|
||||
If `ignore_geometry=True` a :obj:`pandas.DataFrame` will be returned.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The format drivers will attempt to detect the encoding of your data, but
|
||||
may fail. In this case, the proper encoding can be specified explicitly
|
||||
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
|
||||
|
||||
When specifying a URL, geopandas will check if the server supports reading
|
||||
partial data and in that case pass the URL as is to the underlying engine,
|
||||
which will then use the network file system handler of GDAL to read from
|
||||
the URL. Otherwise geopandas will download the data from the URL and pass
|
||||
all data in-memory to the underlying engine.
|
||||
If you need more control over how the URL is read, you can specify the
|
||||
GDAL virtual filesystem manually (e.g. ``/vsicurl/https://...``). See the
|
||||
GDAL documentation on filesystems for more details
|
||||
(https://gdal.org/user/virtual_file_systems.html#vsicurl-http-https-ftp-files-random-access).
|
||||
|
||||
"""
|
||||
engine = _check_engine(engine, "'read_file' function")
|
||||
|
||||
filename = _expand_user(filename)
|
||||
|
||||
from_bytes = False
|
||||
if _is_url(filename):
|
||||
# if it is a url that supports random access -> pass through to
|
||||
# pyogrio/fiona as is (to support downloading only part of the file)
|
||||
# otherwise still download manually because pyogrio/fiona don't support
|
||||
# all types of urls (https://github.com/geopandas/geopandas/issues/2908)
|
||||
with urllib.request.urlopen(filename) as response:
|
||||
if not response.headers.get("Accept-Ranges") == "bytes":
|
||||
filename = response.read()
|
||||
from_bytes = True
|
||||
|
||||
if engine == "pyogrio":
|
||||
return _read_file_pyogrio(
|
||||
filename, bbox=bbox, mask=mask, columns=columns, rows=rows, **kwargs
|
||||
)
|
||||
|
||||
elif engine == "fiona":
|
||||
if pd.api.types.is_file_like(filename):
|
||||
data = filename.read()
|
||||
path_or_bytes = data.encode("utf-8") if isinstance(data, str) else data
|
||||
from_bytes = True
|
||||
else:
|
||||
path_or_bytes = filename
|
||||
|
||||
return _read_file_fiona(
|
||||
path_or_bytes,
|
||||
from_bytes,
|
||||
bbox=bbox,
|
||||
mask=mask,
|
||||
columns=columns,
|
||||
rows=rows,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
else:
|
||||
raise ValueError(f"unknown engine '{engine}'")
|
||||
|
||||
|
||||
def _read_file_fiona(
|
||||
path_or_bytes,
|
||||
from_bytes,
|
||||
bbox=None,
|
||||
mask=None,
|
||||
columns=None,
|
||||
rows=None,
|
||||
where=None,
|
||||
**kwargs,
|
||||
):
|
||||
if where is not None and not FIONA_GE_19:
|
||||
raise NotImplementedError("where requires fiona 1.9+")
|
||||
|
||||
if columns is not None:
|
||||
if "include_fields" in kwargs:
|
||||
raise ValueError(
|
||||
"Cannot specify both 'include_fields' and 'columns' keywords"
|
||||
)
|
||||
if not FIONA_GE_19:
|
||||
raise NotImplementedError("'columns' keyword requires fiona 1.9+")
|
||||
kwargs["include_fields"] = columns
|
||||
elif "include_fields" in kwargs:
|
||||
# alias to columns, as this variable is used below to specify column order
|
||||
# in the dataframe creation
|
||||
columns = kwargs["include_fields"]
|
||||
|
||||
if not from_bytes:
|
||||
# Opening a file via URL or file-like-object above automatically detects a
|
||||
# zipped file. In order to match that behavior, attempt to add a zip scheme
|
||||
# if missing.
|
||||
path_or_bytes = vsi_path(str(path_or_bytes))
|
||||
|
||||
if from_bytes:
|
||||
reader = fiona.BytesCollection
|
||||
else:
|
||||
reader = fiona.open
|
||||
|
||||
with fiona_env():
|
||||
with reader(path_or_bytes, **kwargs) as features:
|
||||
crs = features.crs_wkt
|
||||
# attempt to get EPSG code
|
||||
try:
|
||||
# fiona 1.9+
|
||||
epsg = features.crs.to_epsg(confidence_threshold=100)
|
||||
if epsg is not None:
|
||||
crs = epsg
|
||||
except AttributeError:
|
||||
# fiona <= 1.8
|
||||
try:
|
||||
crs = features.crs["init"]
|
||||
except (TypeError, KeyError):
|
||||
pass
|
||||
|
||||
# handle loading the bounding box
|
||||
if bbox is not None:
|
||||
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
|
||||
bbox = tuple(bbox.to_crs(crs).total_bounds)
|
||||
elif isinstance(bbox, BaseGeometry):
|
||||
bbox = bbox.bounds
|
||||
assert len(bbox) == 4
|
||||
# handle loading the mask
|
||||
elif isinstance(mask, (GeoDataFrame, GeoSeries)):
|
||||
mask = mapping(mask.to_crs(crs).union_all())
|
||||
elif isinstance(mask, BaseGeometry):
|
||||
mask = mapping(mask)
|
||||
|
||||
filters = {}
|
||||
if bbox is not None:
|
||||
filters["bbox"] = bbox
|
||||
if mask is not None:
|
||||
filters["mask"] = mask
|
||||
if where is not None:
|
||||
filters["where"] = where
|
||||
|
||||
# setup the data loading filter
|
||||
if rows is not None:
|
||||
if isinstance(rows, int):
|
||||
rows = slice(rows)
|
||||
elif not isinstance(rows, slice):
|
||||
raise TypeError("'rows' must be an integer or a slice.")
|
||||
f_filt = features.filter(rows.start, rows.stop, rows.step, **filters)
|
||||
elif filters:
|
||||
f_filt = features.filter(**filters)
|
||||
else:
|
||||
f_filt = features
|
||||
# get list of columns
|
||||
columns = columns or list(features.schema["properties"])
|
||||
datetime_fields = [
|
||||
k for (k, v) in features.schema["properties"].items() if v == "datetime"
|
||||
]
|
||||
if (
|
||||
kwargs.get("ignore_geometry", False)
|
||||
or features.schema["geometry"] == "None"
|
||||
):
|
||||
df = pd.DataFrame(
|
||||
[record["properties"] for record in f_filt], columns=columns
|
||||
)
|
||||
else:
|
||||
df = GeoDataFrame.from_features(
|
||||
f_filt, crs=crs, columns=columns + ["geometry"]
|
||||
)
|
||||
for k in datetime_fields:
|
||||
as_dt = None
|
||||
# plain try catch for when pandas will raise in the future
|
||||
# TODO we can tighten the exception type in future when it does
|
||||
try:
|
||||
with warnings.catch_warnings():
|
||||
# pandas 2.x does not yet enforce this behaviour but raises a
|
||||
                        # warning -> we want to suppress this warning for our users,
|
||||
# and do this by turning it into an error so we take the
|
||||
# `except` code path to try again with utc=True
|
||||
warnings.filterwarnings(
|
||||
"error",
|
||||
"In a future version of pandas, parsing datetimes with "
|
||||
"mixed time zones will raise an error",
|
||||
FutureWarning,
|
||||
)
|
||||
as_dt = pd.to_datetime(df[k])
|
||||
except Exception:
|
||||
pass
|
||||
if as_dt is None or as_dt.dtype == "object":
|
||||
# if to_datetime failed, try again for mixed timezone offsets
|
||||
# This can still fail if there are invalid datetimes
|
||||
try:
|
||||
as_dt = pd.to_datetime(df[k], utc=True)
|
||||
except Exception:
|
||||
pass
|
||||
# if to_datetime succeeded, round datetimes as
|
||||
# fiona only supports up to ms precision (any microseconds are
|
||||
# floating point rounding error)
|
||||
if as_dt is not None and not (as_dt.dtype == "object"):
|
||||
if PANDAS_GE_20:
|
||||
df[k] = as_dt.dt.as_unit("ms")
|
||||
else:
|
||||
df[k] = as_dt.dt.round(freq="ms")
|
||||
return df
|
||||
|
||||
|
||||
def _read_file_pyogrio(path_or_bytes, bbox=None, mask=None, rows=None, **kwargs):
|
||||
import pyogrio
|
||||
|
||||
if rows is not None:
|
||||
if isinstance(rows, int):
|
||||
kwargs["max_features"] = rows
|
||||
elif isinstance(rows, slice):
|
||||
if rows.start is not None:
|
||||
if rows.start < 0:
|
||||
raise ValueError(
|
||||
"Negative slice start not supported with the 'pyogrio' engine."
|
||||
)
|
||||
kwargs["skip_features"] = rows.start
|
||||
if rows.stop is not None:
|
||||
kwargs["max_features"] = rows.stop - (rows.start or 0)
|
||||
if rows.step is not None:
|
||||
raise ValueError("slice with step is not supported")
|
||||
else:
|
||||
raise TypeError("'rows' must be an integer or a slice.")
|
||||
|
||||
if bbox is not None and mask is not None:
|
||||
# match error message from Fiona
|
||||
raise ValueError("mask and bbox can not be set together")
|
||||
|
||||
if bbox is not None:
|
||||
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
|
||||
crs = pyogrio.read_info(path_or_bytes).get("crs")
|
||||
if isinstance(path_or_bytes, IOBase):
|
||||
path_or_bytes.seek(0)
|
||||
|
||||
bbox = tuple(bbox.to_crs(crs).total_bounds)
|
||||
elif isinstance(bbox, BaseGeometry):
|
||||
bbox = bbox.bounds
|
||||
if len(bbox) != 4:
|
||||
raise ValueError("'bbox' should be a length-4 tuple.")
|
||||
|
||||
if mask is not None:
|
||||
# NOTE: mask cannot be used at same time as bbox keyword
|
||||
if isinstance(mask, (GeoDataFrame, GeoSeries)):
|
||||
crs = pyogrio.read_info(path_or_bytes).get("crs")
|
||||
if isinstance(path_or_bytes, IOBase):
|
||||
path_or_bytes.seek(0)
|
||||
|
||||
mask = shapely.unary_union(mask.to_crs(crs).geometry.values)
|
||||
elif isinstance(mask, BaseGeometry):
|
||||
mask = shapely.unary_union(mask)
|
||||
elif isinstance(mask, dict) or hasattr(mask, "__geo_interface__"):
|
||||
# convert GeoJSON to shapely geometry
|
||||
mask = shapely.geometry.shape(mask)
|
||||
|
||||
kwargs["mask"] = mask
|
||||
|
||||
if kwargs.pop("ignore_geometry", False):
|
||||
kwargs["read_geometry"] = False
|
||||
|
||||
# translate `ignore_fields`/`include_fields` keyword for back compat with fiona
|
||||
if "ignore_fields" in kwargs and "include_fields" in kwargs:
|
||||
raise ValueError("Cannot specify both 'ignore_fields' and 'include_fields'")
|
||||
elif "ignore_fields" in kwargs:
|
||||
if kwargs.get("columns", None) is not None:
|
||||
raise ValueError(
|
||||
"Cannot specify both 'columns' and 'ignore_fields' keywords"
|
||||
)
|
||||
warnings.warn(
|
||||
"The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
|
||||
"will be removed in a future release. You can use the 'columns' keyword "
|
||||
"instead to select which columns to read.",
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
ignore_fields = kwargs.pop("ignore_fields")
|
||||
fields = pyogrio.read_info(path_or_bytes)["fields"]
|
||||
include_fields = [col for col in fields if col not in ignore_fields]
|
||||
kwargs["columns"] = include_fields
|
||||
elif "include_fields" in kwargs:
|
||||
# translate `include_fields` keyword for back compat with fiona engine
|
||||
if kwargs.get("columns", None) is not None:
|
||||
raise ValueError(
|
||||
"Cannot specify both 'columns' and 'include_fields' keywords"
|
||||
)
|
||||
warnings.warn(
|
||||
"The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
|
||||
"will be removed in a future release. You can use the 'columns' keyword "
|
||||
"instead to select which columns to read.",
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
kwargs["columns"] = kwargs.pop("include_fields")
|
||||
|
||||
return pyogrio.read_dataframe(path_or_bytes, bbox=bbox, **kwargs)
|
||||
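# Illustrative sketch of the ``rows`` translation above (no I/O involved): an
# integer becomes ``max_features`` and a slice becomes ``skip_features`` plus
# ``max_features``; slices with a step raise.
def _example_rows_to_pyogrio_kwargs():
    rows = slice(5, 20)
    kwargs = {
        "skip_features": rows.start,             # 5
        "max_features": rows.stop - rows.start,  # 15
    }
    return kwargs  # rows=10 would instead map to {"max_features": 10}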
|
||||
|
||||
def _detect_driver(path):
|
||||
"""
|
||||
Attempt to auto-detect driver based on the extension
|
||||
"""
|
||||
try:
|
||||
# in case the path is a file handle
|
||||
path = path.name
|
||||
except AttributeError:
|
||||
pass
|
||||
try:
|
||||
return _EXTENSION_TO_DRIVER[Path(path).suffix.lower()]
|
||||
except KeyError:
|
||||
# Assume it is a shapefile folder for now. In the future,
|
||||
# will likely raise an exception when the expected
|
||||
# folder writing behavior is more clearly defined.
|
||||
return "ESRI Shapefile"
|
||||
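# Illustrative checks against the extension mapping above (paths are
# hypothetical and nothing is read from or written to disk):
def _example_detect_driver():
    assert _detect_driver("cities.gpkg") == "GPKG"
    assert _detect_driver("cities.GeoJSON") == "GeoJSON"  # suffix is lower-cased
    assert _detect_driver("cities") == "ESRI Shapefile"   # no extension -> fallback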
|
||||
|
||||
def _to_file(
|
||||
df,
|
||||
filename,
|
||||
driver=None,
|
||||
schema=None,
|
||||
index=None,
|
||||
mode="w",
|
||||
crs=None,
|
||||
engine=None,
|
||||
metadata=None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Write this GeoDataFrame to an OGR data source
|
||||
|
||||
A dictionary of supported OGR providers is available via:
|
||||
|
||||
>>> import pyogrio
|
||||
>>> pyogrio.list_drivers() # doctest: +SKIP
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : GeoDataFrame to be written
|
||||
filename : string
|
||||
File path or file handle to write to. The path may specify a
|
||||
GDAL VSI scheme.
|
||||
driver : string, default None
|
||||
The OGR format driver used to write the vector file.
|
||||
If not specified, it attempts to infer it from the file extension.
|
||||
If no extension is specified, it saves ESRI Shapefile to a folder.
|
||||
schema : dict, default None
|
||||
If specified, the schema dictionary is passed to Fiona to
|
||||
better control how the file is written. If None, GeoPandas
|
||||
will determine the schema based on each column's dtype.
|
||||
Not supported for the "pyogrio" engine.
|
||||
index : bool, default None
|
||||
If True, write index into one or more columns (for MultiIndex).
|
||||
Default None writes the index into one or more columns only if
|
||||
the index is named, is a MultiIndex, or has a non-integer data
|
||||
type. If False, no index is written.
|
||||
|
||||
.. versionadded:: 0.7
|
||||
Previously the index was not written.
|
||||
mode : string, default 'w'
|
||||
The write mode, 'w' to overwrite the existing file and 'a' to append;
|
||||
when using the pyogrio engine, you can also pass ``append=True``.
|
||||
Not all drivers support appending. For the fiona engine, the drivers
|
||||
that support appending are listed in fiona.supported_drivers or
|
||||
https://github.com/Toblerity/Fiona/blob/master/fiona/drvsupport.py.
|
||||
For the pyogrio engine, you should be able to use any driver that
|
||||
is available in your installation of GDAL that supports append
|
||||
capability; see the specific driver entry at
|
||||
https://gdal.org/drivers/vector/index.html for more information.
|
||||
crs : pyproj.CRS, default None
|
||||
If specified, the CRS is passed to Fiona to
|
||||
better control how the file is written. If None, GeoPandas
|
||||
will determine the crs based on crs df attribute.
|
||||
The value can be anything accepted
|
||||
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
|
||||
such as an authority string (eg "EPSG:4326") or a WKT string.
|
||||
engine : str, "pyogrio" or "fiona"
|
||||
The underlying library that is used to read the file. Currently, the
|
||||
supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
|
||||
installed, otherwise tries "fiona". Engine can also be set globally
|
||||
with the ``geopandas.options.io_engine`` option.
|
||||
metadata : dict[str, str], default None
|
||||
Optional metadata to be stored in the file. Keys and values must be
|
||||
strings. Only supported for the "GPKG" driver
|
||||
(requires Fiona >= 1.9 or pyogrio >= 0.6).
|
||||
**kwargs :
|
||||
Keyword args to be passed to the engine, and can be used to write
|
||||
to multi-layer data, store data within archives (zip files), etc.
|
||||
In case of the "fiona" engine, the keyword arguments are passed to
|
||||
        `fiona.open`. For more information on possible keywords, type:
|
||||
``import fiona; help(fiona.open)``. In case of the "pyogrio" engine,
|
||||
the keyword arguments are passed to `pyogrio.write_dataframe`.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The format drivers will attempt to detect the encoding of your data, but
|
||||
may fail. In this case, the proper encoding can be specified explicitly
|
||||
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
|
||||
"""
|
||||
engine = _check_engine(engine, "'to_file' method")
|
||||
|
||||
filename = _expand_user(filename)
|
||||
|
||||
if index is None:
|
||||
# Determine if index attribute(s) should be saved to file
|
||||
# (only if they are named or are non-integer)
|
||||
index = list(df.index.names) != [None] or not is_integer_dtype(df.index.dtype)
|
||||
if index:
|
||||
df = df.reset_index(drop=False)
|
||||
|
||||
if driver is None:
|
||||
driver = _detect_driver(filename)
|
||||
|
||||
if driver == "ESRI Shapefile" and any(len(c) > 10 for c in df.columns.tolist()):
|
||||
warnings.warn(
|
||||
"Column names longer than 10 characters will be truncated when saved to "
|
||||
"ESRI Shapefile.",
|
||||
stacklevel=3,
|
||||
)
|
||||
|
||||
if (df.dtypes == "geometry").sum() > 1:
|
||||
raise ValueError(
|
||||
"GeoDataFrame contains multiple geometry columns but GeoDataFrame.to_file "
|
||||
"supports only a single geometry column. Use a GeoDataFrame.to_parquet or "
|
||||
"GeoDataFrame.to_feather, drop additional geometry columns or convert them "
|
||||
"to a supported format like a well-known text (WKT) using "
|
||||
"`GeoSeries.to_wkt()`.",
|
||||
)
|
||||
_check_metadata_supported(metadata, engine, driver)
|
||||
|
||||
if mode not in ("w", "a"):
|
||||
raise ValueError(f"'mode' should be one of 'w' or 'a', got '{mode}' instead")
|
||||
|
||||
if engine == "pyogrio":
|
||||
_to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs)
|
||||
elif engine == "fiona":
|
||||
_to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs)
|
||||
else:
|
||||
raise ValueError(f"unknown engine '{engine}'")
|
||||
|
||||
|
||||
def _to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs):
|
||||
if not HAS_PYPROJ and crs:
|
||||
raise ImportError(
|
||||
"The 'pyproj' package is required to write a file with a CRS, but it is not"
|
||||
" installed or does not import correctly."
|
||||
)
|
||||
|
||||
if schema is None:
|
||||
schema = infer_schema(df)
|
||||
|
||||
if crs:
|
||||
from pyproj import CRS
|
||||
|
||||
crs = CRS.from_user_input(crs)
|
||||
else:
|
||||
crs = df.crs
|
||||
|
||||
with fiona_env():
|
||||
crs_wkt = None
|
||||
try:
|
||||
gdal_version = Version(
|
||||
fiona.env.get_gdal_release_name().strip("e")
|
||||
) # GH3147
|
||||
except (AttributeError, ValueError):
|
||||
gdal_version = Version("2.0.0") # just assume it is not the latest
|
||||
if gdal_version >= Version("3.0.0") and crs:
|
||||
crs_wkt = crs.to_wkt()
|
||||
elif crs:
|
||||
crs_wkt = crs.to_wkt("WKT1_GDAL")
|
||||
with fiona.open(
|
||||
filename, mode=mode, driver=driver, crs_wkt=crs_wkt, schema=schema, **kwargs
|
||||
) as colxn:
|
||||
if metadata is not None:
|
||||
colxn.update_tags(metadata)
|
||||
colxn.writerecords(df.iterfeatures())
|
||||
|
||||
|
||||
def _to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs):
|
||||
import pyogrio
|
||||
|
||||
if schema is not None:
|
||||
raise ValueError(
|
||||
"The 'schema' argument is not supported with the 'pyogrio' engine."
|
||||
)
|
||||
|
||||
if mode == "a":
|
||||
kwargs["append"] = True
|
||||
|
||||
if crs is not None:
|
||||
raise ValueError("Passing 'crs' is not supported with the 'pyogrio' engine.")
|
||||
|
||||
# for the fiona engine, this check is done in gdf.iterfeatures()
|
||||
if not df.columns.is_unique:
|
||||
raise ValueError("GeoDataFrame cannot contain duplicated column names.")
|
||||
|
||||
pyogrio.write_dataframe(df, filename, driver=driver, metadata=metadata, **kwargs)
|
||||
|
||||
|
||||
def infer_schema(df):
|
||||
from collections import OrderedDict
|
||||
|
||||
# TODO: test pandas string type and boolean type once released
|
||||
types = {
|
||||
"Int32": "int32",
|
||||
"int32": "int32",
|
||||
"Int64": "int",
|
||||
"string": "str",
|
||||
"boolean": "bool",
|
||||
}
|
||||
|
||||
def convert_type(column, in_type):
|
||||
if in_type == object:
|
||||
return "str"
|
||||
if in_type.name.startswith("datetime64"):
|
||||
# numpy datetime type regardless of frequency
|
||||
return "datetime"
|
||||
if str(in_type) in types:
|
||||
out_type = types[str(in_type)]
|
||||
else:
|
||||
out_type = type(np.zeros(1, in_type).item()).__name__
|
||||
if out_type == "long":
|
||||
out_type = "int"
|
||||
return out_type
|
||||
|
||||
properties = OrderedDict(
|
||||
[
|
||||
(col, convert_type(col, _type))
|
||||
for col, _type in zip(df.columns, df.dtypes)
|
||||
if col != df._geometry_column_name
|
||||
]
|
||||
)
|
||||
|
||||
if df.empty:
|
||||
warnings.warn(
|
||||
"You are attempting to write an empty DataFrame to file. "
|
||||
"For some drivers, this operation may fail.",
|
||||
UserWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
|
||||
# Since https://github.com/Toblerity/Fiona/issues/446 resolution,
|
||||
# Fiona allows a list of geometry types
|
||||
geom_types = _geometry_types(df)
|
||||
|
||||
schema = {"geometry": geom_types, "properties": properties}
|
||||
|
||||
return schema
|
||||
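# Illustrative sketch of what infer_schema() produces for a small frame of
# Points with one integer and one string column (column names are made up):
def _example_infer_schema():
    from shapely.geometry import Point

    from geopandas import GeoDataFrame

    gdf = GeoDataFrame(
        {"pop": [100, 200], "name": ["a", "b"]},
        geometry=[Point(0, 0), Point(1, 1)],
    )
    return infer_schema(gdf)
    # -> {"geometry": "Point",
    #     "properties": OrderedDict([("pop", "int"), ("name", "str")])}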
|
||||
|
||||
def _geometry_types(df):
|
||||
"""
|
||||
Determine the geometry types in the GeoDataFrame for the schema.
|
||||
"""
|
||||
geom_types_2D = df[~df.geometry.has_z].geometry.geom_type.unique()
|
||||
geom_types_2D = [gtype for gtype in geom_types_2D if gtype is not None]
|
||||
geom_types_3D = df[df.geometry.has_z].geometry.geom_type.unique()
|
||||
geom_types_3D = ["3D " + gtype for gtype in geom_types_3D if gtype is not None]
|
||||
geom_types = geom_types_3D + geom_types_2D
|
||||
|
||||
if len(geom_types) == 0:
|
||||
# Default geometry type supported by Fiona
|
||||
# (Since https://github.com/Toblerity/Fiona/issues/446 resolution)
|
||||
return "Unknown"
|
||||
|
||||
if len(geom_types) == 1:
|
||||
geom_types = geom_types[0]
|
||||
|
||||
return geom_types
|
||||
|
||||
|
||||
def _list_layers(filename) -> pd.DataFrame:
|
||||
"""List layers available in a file.
|
||||
|
||||
Provides an overview of layers available in a file or URL together with their
|
||||
geometry types. When supported by the data source, this includes both spatial and
|
||||
non-spatial layers. Non-spatial layers are indicated by the ``"geometry_type"``
|
||||
column being ``None``. GeoPandas will not read such layers but they can be read into
|
||||
a pd.DataFrame using :func:`pyogrio.read_dataframe`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
filename : str, path object or file-like object
|
||||
Either the absolute or relative path to the file or URL to
|
||||
be opened, or any object with a read() method (such as an open file
|
||||
or StringIO)
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.DataFrame
|
||||
A DataFrame with columns "name" and "geometry_type" and one row per layer.
|
||||
"""
|
||||
_import_pyogrio()
|
||||
_check_pyogrio("list_layers")
|
||||
|
||||
import pyogrio
|
||||
|
||||
return pd.DataFrame(
|
||||
pyogrio.list_layers(filename), columns=["name", "geometry_type"]
|
||||
)
|
||||
@@ -0,0 +1,473 @@
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
from functools import lru_cache
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import shapely
|
||||
import shapely.wkb
|
||||
|
||||
from geopandas import GeoDataFrame
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _get_conn(conn_or_engine):
|
||||
"""
|
||||
Yield a connection within a transaction context.
|
||||
|
||||
Engine.begin() returns a Connection with an implicit Transaction while
|
||||
Connection.begin() returns the Transaction. This helper will always return a
|
||||
Connection with an implicit (possibly nested) Transaction.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
conn_or_engine : Connection or Engine
|
||||
A sqlalchemy Connection or Engine instance
|
||||
Returns
|
||||
-------
|
||||
Connection
|
||||
"""
|
||||
from sqlalchemy.engine.base import Connection, Engine
|
||||
|
||||
if isinstance(conn_or_engine, Connection):
|
||||
if not conn_or_engine.in_transaction():
|
||||
with conn_or_engine.begin():
|
||||
yield conn_or_engine
|
||||
else:
|
||||
yield conn_or_engine
|
||||
elif isinstance(conn_or_engine, Engine):
|
||||
with conn_or_engine.begin() as conn:
|
||||
yield conn
|
||||
else:
|
||||
raise ValueError(f"Unknown Connectable: {conn_or_engine}")
|
||||
|
||||
|
||||
def _df_to_geodf(df, geom_col="geom", crs=None, con=None):
|
||||
"""
|
||||
Transforms a pandas DataFrame into a GeoDataFrame.
|
||||
The column 'geom_col' must be a geometry column in WKB representation.
|
||||
To be used to convert df based on pd.read_sql to gdf.
|
||||
Parameters
|
||||
----------
|
||||
df : DataFrame
|
||||
pandas DataFrame with geometry column in WKB representation.
|
||||
geom_col : string, default 'geom'
|
||||
column name to convert to shapely geometries
|
||||
crs : pyproj.CRS, optional
|
||||
CRS to use for the returned GeoDataFrame. The value can be anything accepted
|
||||
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
|
||||
such as an authority string (eg "EPSG:4326") or a WKT string.
|
||||
If not set, tries to determine CRS from the SRID associated with the
|
||||
first geometry in the database, and assigns that to all geometries.
|
||||
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
|
||||
Active connection to the database to query.
|
||||
Returns
|
||||
-------
|
||||
GeoDataFrame
|
||||
"""
|
||||
|
||||
if geom_col not in df:
|
||||
raise ValueError("Query missing geometry column '{}'".format(geom_col))
|
||||
|
||||
if df.columns.to_list().count(geom_col) > 1:
|
||||
raise ValueError(
|
||||
f"Duplicate geometry column '{geom_col}' detected in SQL query output. Only"
|
||||
"one geometry column is allowed."
|
||||
)
|
||||
|
||||
geoms = df[geom_col].dropna()
|
||||
|
||||
if not geoms.empty:
|
||||
load_geom_bytes = shapely.wkb.loads
|
||||
"""Load from Python 3 binary."""
|
||||
|
||||
def load_geom_text(x):
|
||||
"""Load from binary encoded as text."""
|
||||
return shapely.wkb.loads(str(x), hex=True)
|
||||
|
||||
if isinstance(geoms.iat[0], bytes):
|
||||
load_geom = load_geom_bytes
|
||||
else:
|
||||
load_geom = load_geom_text
|
||||
|
||||
df[geom_col] = geoms = geoms.apply(load_geom)
|
||||
if crs is None:
|
||||
srid = shapely.get_srid(geoms.iat[0])
|
||||
# if no defined SRID in geodatabase, returns SRID of 0
|
||||
if srid != 0:
|
||||
try:
|
||||
spatial_ref_sys_df = _get_spatial_ref_sys_df(con, srid)
|
||||
except pd.errors.DatabaseError:
|
||||
warning_msg = (
|
||||
f"Could not find the spatial reference system table "
|
||||
f"(spatial_ref_sys) in PostGIS."
|
||||
f"Trying epsg:{srid} as a fallback."
|
||||
)
|
||||
warnings.warn(warning_msg, UserWarning, stacklevel=3)
|
||||
crs = "epsg:{}".format(srid)
|
||||
else:
|
||||
if not spatial_ref_sys_df.empty:
|
||||
auth_name = spatial_ref_sys_df["auth_name"].item()
|
||||
crs = f"{auth_name}:{srid}"
|
||||
else:
|
||||
warning_msg = (
|
||||
f"Could not find srid {srid} in the "
|
||||
f"spatial_ref_sys table. "
|
||||
f"Trying epsg:{srid} as a fallback."
|
||||
)
|
||||
warnings.warn(warning_msg, UserWarning, stacklevel=3)
|
||||
crs = "epsg:{}".format(srid)
|
||||
|
||||
return GeoDataFrame(df, crs=crs, geometry=geom_col)
|
||||
|
||||
|
||||
def _read_postgis(
|
||||
sql,
|
||||
con,
|
||||
geom_col="geom",
|
||||
crs=None,
|
||||
index_col=None,
|
||||
coerce_float=True,
|
||||
parse_dates=None,
|
||||
params=None,
|
||||
chunksize=None,
|
||||
):
|
||||
"""
|
||||
Returns a GeoDataFrame corresponding to the result of the query
|
||||
string, which must contain a geometry column in WKB representation.
|
||||
|
||||
It is also possible to use :meth:`~GeoDataFrame.read_file` to read from a database.
|
||||
Especially for file geodatabases like GeoPackage or SpatiaLite this can be easier.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sql : string
|
||||
SQL query to execute in selecting entries from database, or name
|
||||
of the table to read from the database.
|
||||
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
|
||||
Active connection to the database to query.
|
||||
geom_col : string, default 'geom'
|
||||
column name to convert to shapely geometries
|
||||
crs : dict or str, optional
|
||||
CRS to use for the returned GeoDataFrame; if not set, tries to
|
||||
determine CRS from the SRID associated with the first geometry in
|
||||
the database, and assigns that to all geometries.
|
||||
chunksize : int, default None
|
||||
If specified, return an iterator where chunksize is the number of rows to
|
||||
include in each chunk.
|
||||
|
||||
See the documentation for pandas.read_sql for further explanation
|
||||
of the following parameters:
|
||||
index_col, coerce_float, parse_dates, params, chunksize
|
||||
|
||||
Returns
|
||||
-------
|
||||
GeoDataFrame
|
||||
|
||||
Examples
|
||||
--------
|
||||
PostGIS
|
||||
|
||||
>>> from sqlalchemy import create_engine # doctest: +SKIP
|
||||
>>> db_connection_url = "postgresql://myusername:mypassword@myhost:5432/mydatabase"
|
||||
>>> con = create_engine(db_connection_url) # doctest: +SKIP
|
||||
>>> sql = "SELECT geom, highway FROM roads"
|
||||
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
|
||||
|
||||
SpatiaLite
|
||||
|
||||
>>> sql = "SELECT ST_AsBinary(geom) AS geom, highway FROM roads"
|
||||
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
|
||||
"""
|
||||
|
||||
if chunksize is None:
|
||||
# read all in one chunk and return a single GeoDataFrame
|
||||
df = pd.read_sql(
|
||||
sql,
|
||||
con,
|
||||
index_col=index_col,
|
||||
coerce_float=coerce_float,
|
||||
parse_dates=parse_dates,
|
||||
params=params,
|
||||
chunksize=chunksize,
|
||||
)
|
||||
return _df_to_geodf(df, geom_col=geom_col, crs=crs, con=con)
|
||||
|
||||
else:
|
||||
# read data in chunks and return a generator
|
||||
df_generator = pd.read_sql(
|
||||
sql,
|
||||
con,
|
||||
index_col=index_col,
|
||||
coerce_float=coerce_float,
|
||||
parse_dates=parse_dates,
|
||||
params=params,
|
||||
chunksize=chunksize,
|
||||
)
|
||||
return (
|
||||
_df_to_geodf(df, geom_col=geom_col, crs=crs, con=con) for df in df_generator
|
||||
)
|
||||
|
||||
|
||||
def _get_geometry_type(gdf):
|
||||
"""
|
||||
Get basic geometry type of a GeoDataFrame. See more info from:
|
||||
https://geoalchemy-2.readthedocs.io/en/latest/types.html#geoalchemy2.types._GISType
|
||||
|
||||
Following rules apply:
|
||||
- if geometries all share the same geometry-type,
|
||||
geometries are inserted with the given GeometryType with following types:
|
||||
- Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon,
|
||||
GeometryCollection.
|
||||
        - LinearRing geometries will be converted into LineString objects.
|
||||
- in all other cases, geometries will be inserted with type GEOMETRY:
|
||||
- a mix of Polygons and MultiPolygons in GeoSeries
|
||||
- a mix of Points and LineStrings in GeoSeries
|
||||
- geometry is of type GeometryCollection,
|
||||
such as GeometryCollection([Point, LineStrings])
|
||||
- if any of the geometries has Z-coordinate, all records will
|
||||
be written with 3D.
|
||||
"""
|
||||
geom_types = list(gdf.geometry.geom_type.unique())
|
||||
has_curve = False
|
||||
|
||||
for gt in geom_types:
|
||||
if gt is None:
|
||||
continue
|
||||
elif "LinearRing" in gt:
|
||||
has_curve = True
|
||||
|
||||
if len(geom_types) == 1:
|
||||
if has_curve:
|
||||
target_geom_type = "LINESTRING"
|
||||
else:
|
||||
if geom_types[0] is None:
|
||||
raise ValueError("No valid geometries in the data.")
|
||||
else:
|
||||
target_geom_type = geom_types[0].upper()
|
||||
else:
|
||||
target_geom_type = "GEOMETRY"
|
||||
|
||||
# Check for 3D-coordinates
|
||||
if any(gdf.geometry.has_z):
|
||||
target_geom_type += "Z"
|
||||
|
||||
return target_geom_type, has_curve
|
||||
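# Illustrative sketch of the rules above: a uniform GeoSeries keeps its own
# (upper-cased) type, while mixing types falls back to the generic GEOMETRY.
def _example_get_geometry_type():
    from shapely.geometry import MultiPoint, Point

    from geopandas import GeoDataFrame

    uniform = GeoDataFrame(geometry=[Point(0, 0), Point(1, 1)])
    mixed = GeoDataFrame(geometry=[Point(0, 0), MultiPoint([(1, 1), (2, 2)])])
    return _get_geometry_type(uniform), _get_geometry_type(mixed)
    # -> (("POINT", False), ("GEOMETRY", False))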
|
||||
|
||||
def _get_srid_from_crs(gdf):
|
||||
"""
|
||||
Get EPSG code from CRS if available. If not, return 0.
|
||||
"""
|
||||
|
||||
# Use geoalchemy2 default for srid
|
||||
# Note: undefined srid in PostGIS is 0
|
||||
srid = None
|
||||
warning_msg = (
|
||||
"Could not parse CRS from the GeoDataFrame. "
|
||||
"Inserting data without defined CRS."
|
||||
)
|
||||
if gdf.crs is not None:
|
||||
try:
|
||||
for confidence in (100, 70, 25):
|
||||
srid = gdf.crs.to_epsg(min_confidence=confidence)
|
||||
if srid is not None:
|
||||
break
|
||||
auth_srid = gdf.crs.to_authority(
|
||||
auth_name="ESRI", min_confidence=confidence
|
||||
)
|
||||
if auth_srid is not None:
|
||||
srid = int(auth_srid[1])
|
||||
break
|
||||
except Exception:
|
||||
warnings.warn(warning_msg, UserWarning, stacklevel=2)
|
||||
|
||||
if srid is None:
|
||||
srid = 0
|
||||
warnings.warn(warning_msg, UserWarning, stacklevel=2)
|
||||
|
||||
return srid
|
||||
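# Illustrative sketch (requires pyproj): a GeoDataFrame with an EPSG CRS
# resolves to its numeric code, while a missing CRS falls back to 0 with a
# warning.
def _example_get_srid_from_crs():
    from shapely.geometry import Point

    from geopandas import GeoDataFrame

    gdf = GeoDataFrame(geometry=[Point(0, 0)], crs="EPSG:4326")
    return _get_srid_from_crs(gdf)  # -> 4326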
|
||||
|
||||
def _convert_linearring_to_linestring(gdf, geom_name):
|
||||
from shapely.geometry import LineString
|
||||
|
||||
# Todo: Use shapely function once it's implemented:
|
||||
# https://github.com/shapely/shapely/issues/1617
|
||||
|
||||
mask = gdf.geom_type == "LinearRing"
|
||||
gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(
|
||||
lambda geom: LineString(geom)
|
||||
)
|
||||
return gdf
|
||||
|
||||
|
||||
def _convert_to_ewkb(gdf, geom_name, srid):
|
||||
"""Convert geometries to ewkb."""
|
||||
geoms = shapely.to_wkb(
|
||||
shapely.set_srid(gdf[geom_name].values._data, srid=srid),
|
||||
hex=True,
|
||||
include_srid=True,
|
||||
)
|
||||
|
||||
# The gdf will warn that the geometry column doesn't hold in-memory geometries
|
||||
# now that they are EWKB, so convert back to a regular dataframe to avoid warning
|
||||
# the user that the dtypes are unexpected.
|
||||
df = pd.DataFrame(gdf, copy=False)
|
||||
df[geom_name] = geoms
|
||||
return df
|
||||
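# Illustrative sketch of the EWKB conversion for a single geometry: tag the
# SRID on the geometry, then dump hex-encoded WKB with the SRID embedded.
def _example_single_geometry_to_ewkb():
    import shapely
    from shapely.geometry import Point

    geom = shapely.set_srid(Point(1, 2), 4326)
    return shapely.to_wkb(geom, hex=True, include_srid=True)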
|
||||
|
||||
def _psql_insert_copy(tbl, conn, keys, data_iter):
|
||||
import csv
|
||||
import io
|
||||
|
||||
s_buf = io.StringIO()
|
||||
writer = csv.writer(s_buf)
|
||||
writer.writerows(data_iter)
|
||||
s_buf.seek(0)
|
||||
|
||||
columns = ", ".join('"{}"'.format(k) for k in keys)
|
||||
|
||||
dbapi_conn = conn.connection
|
||||
sql = 'COPY "{}"."{}" ({}) FROM STDIN WITH CSV'.format(
|
||||
tbl.table.schema, tbl.table.name, columns
|
||||
)
|
||||
with dbapi_conn.cursor() as cur:
|
||||
# Use psycopg method if it's available
|
||||
if hasattr(cur, "copy") and callable(cur.copy):
|
||||
with cur.copy(sql) as copy:
|
||||
copy.write(s_buf.read())
|
||||
else: # otherwise use psycopg2 method
|
||||
cur.copy_expert(sql, s_buf)
|
||||
|
||||
|
||||
def _write_postgis(
|
||||
gdf,
|
||||
name,
|
||||
con,
|
||||
schema=None,
|
||||
if_exists="fail",
|
||||
index=False,
|
||||
index_label=None,
|
||||
chunksize=None,
|
||||
dtype=None,
|
||||
):
|
||||
"""
|
||||
Upload GeoDataFrame into PostGIS database.
|
||||
|
||||
This method requires SQLAlchemy and GeoAlchemy2, and a PostgreSQL
|
||||
Python driver (e.g. psycopg2) to be installed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
name : str
|
||||
Name of the target table.
|
||||
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
|
||||
Active connection to the PostGIS database.
|
||||
if_exists : {'fail', 'replace', 'append'}, default 'fail'
|
||||
How to behave if the table already exists:
|
||||
|
||||
- fail: Raise a ValueError.
|
||||
- replace: Drop the table before inserting new values.
|
||||
- append: Insert new values to the existing table.
|
||||
schema : string, optional
|
||||
Specify the schema. If None, use default schema: 'public'.
|
||||
    index : bool, default False
|
||||
Write DataFrame index as a column.
|
||||
Uses *index_label* as the column name in the table.
|
||||
index_label : string or sequence, default None
|
||||
Column label for index column(s).
|
||||
If None is given (default) and index is True,
|
||||
then the index names are used.
|
||||
chunksize : int, optional
|
||||
Rows will be written in batches of this size at a time.
|
||||
By default, all rows will be written at once.
|
||||
dtype : dict of column name to SQL type, default None
|
||||
Specifying the datatype for columns.
|
||||
The keys should be the column names and the values
|
||||
should be the SQLAlchemy types.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> from sqlalchemy import create_engine # doctest: +SKIP
|
||||
>>> engine = create_engine("postgresql://myusername:mypassword@myhost:5432\
|
||||
/mydatabase";) # doctest: +SKIP
|
||||
>>> gdf.to_postgis("my_table", engine) # doctest: +SKIP
|
||||
"""
|
||||
try:
|
||||
from geoalchemy2 import Geometry
|
||||
from sqlalchemy import text
|
||||
except ImportError:
|
||||
raise ImportError("'to_postgis()' requires geoalchemy2 package.")
|
||||
|
||||
gdf = gdf.copy()
|
||||
geom_name = gdf.geometry.name
|
||||
|
||||
# Get srid
|
||||
srid = _get_srid_from_crs(gdf)
|
||||
|
||||
# Get geometry type and info whether data contains LinearRing.
|
||||
geometry_type, has_curve = _get_geometry_type(gdf)
|
||||
|
||||
# Build dtype with Geometry
|
||||
if dtype is not None:
|
||||
dtype[geom_name] = Geometry(geometry_type=geometry_type, srid=srid)
|
||||
else:
|
||||
dtype = {geom_name: Geometry(geometry_type=geometry_type, srid=srid)}
|
||||
|
||||
# Convert LinearRing geometries to LineString
|
||||
if has_curve:
|
||||
gdf = _convert_linearring_to_linestring(gdf, geom_name)
|
||||
|
||||
# Convert geometries to EWKB
|
||||
gdf = _convert_to_ewkb(gdf, geom_name, srid)
|
||||
|
||||
if schema is not None:
|
||||
schema_name = schema
|
||||
else:
|
||||
schema_name = "public"
|
||||
|
||||
if if_exists == "append":
|
||||
# Check that the geometry srid matches with the current GeoDataFrame
|
||||
with _get_conn(con) as connection:
|
||||
# Only check SRID if table exists
|
||||
if connection.dialect.has_table(connection, name, schema):
|
||||
target_srid = connection.execute(
|
||||
text(
|
||||
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
|
||||
schema=schema_name, table=name, geom_col=geom_name
|
||||
)
|
||||
)
|
||||
).fetchone()[0]
|
||||
|
||||
if target_srid != srid:
|
||||
msg = (
|
||||
"The CRS of the target table (EPSG:{epsg_t}) differs from the "
|
||||
"CRS of current GeoDataFrame (EPSG:{epsg_src}).".format(
|
||||
epsg_t=target_srid, epsg_src=srid
|
||||
)
|
||||
)
|
||||
raise ValueError(msg)
|
||||
|
||||
with _get_conn(con) as connection:
|
||||
gdf.to_sql(
|
||||
name,
|
||||
connection,
|
||||
schema=schema_name,
|
||||
if_exists=if_exists,
|
||||
index=index,
|
||||
index_label=index_label,
|
||||
chunksize=chunksize,
|
||||
dtype=dtype,
|
||||
method=_psql_insert_copy,
|
||||
)
|
||||
|
||||
|
||||
@lru_cache
|
||||
def _get_spatial_ref_sys_df(con, srid):
|
||||
spatial_ref_sys_sql = (
|
||||
f"SELECT srid, auth_name FROM spatial_ref_sys WHERE srid = {srid}"
|
||||
)
|
||||
return pd.read_sql(spatial_ref_sys_sql, con)
|
||||
@@ -0,0 +1,100 @@
|
||||
"""
|
||||
Script to create the data and write legacy storage (pickle) files.
|
||||
|
||||
Based on pandas' generate_legacy_storage_files.py script.
|
||||
|
||||
To use this script, create an environment for which you want to
|
||||
generate pickles, activate the environment, and run this script as:
|
||||
|
||||
$ python geopandas/geopandas/io/tests/generate_legacy_storage_files.py \
|
||||
geopandas/geopandas/io/tests/data/pickle/ pickle
|
||||
|
||||
This script generates a storage file for the current arch, system, and python version.
|
||||
|
||||
The idea here is you are using the *current* version of the
|
||||
generate_legacy_storage_files with an *older* version of geopandas to
|
||||
generate a pickle file. We will then check this file into a current
|
||||
branch, and test using test_pickle.py. This will load the *older*
|
||||
pickles and test versus the current data that is generated
|
||||
(with master). These are then compared.
|
||||
|
||||
"""
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import platform
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from shapely.geometry import Point
|
||||
|
||||
import geopandas
|
||||
|
||||
|
||||
def create_pickle_data():
|
||||
"""create the pickle data"""
|
||||
|
||||
# custom geometry column name
|
||||
gdf_the_geom = geopandas.GeoDataFrame(
|
||||
{"a": [1, 2, 3], "the_geom": [Point(1, 1), Point(2, 2), Point(3, 3)]},
|
||||
geometry="the_geom",
|
||||
)
|
||||
|
||||
# with crs
|
||||
gdf_crs = geopandas.GeoDataFrame(
|
||||
{"a": [0.1, 0.2, 0.3], "geometry": [Point(1, 1), Point(2, 2), Point(3, 3)]},
|
||||
crs="EPSG:4326",
|
||||
)
|
||||
|
||||
return {"gdf_the_geom": gdf_the_geom, "gdf_crs": gdf_crs}
|
||||
|
||||
|
||||
def platform_name():
|
||||
return "_".join(
|
||||
[
|
||||
str(geopandas.__version__),
|
||||
"pd-" + str(pd.__version__),
|
||||
"py-" + str(platform.python_version()),
|
||||
str(platform.machine()),
|
||||
str(platform.system().lower()),
|
||||
]
|
||||
)
|
||||
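# Illustrative sketch only: on a hypothetical setup the name built above might
# look like "0.14.0_pd-2.1.1_py-3.11.5_x86_64_linux", to which
# write_legacy_pickles() appends ".pickle".
def _example_platform_name():
    return platform_name()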
|
||||
|
||||
def write_legacy_pickles(output_dir):
|
||||
print(
|
||||
"This script generates a storage file for the current arch, system, "
|
||||
"and python version"
|
||||
)
|
||||
print("geopandas version: {}").format(geopandas.__version__)
|
||||
print(" output dir : {}".format(output_dir))
|
||||
print(" storage format: pickle")
|
||||
|
||||
pth = "{}.pickle".format(platform_name())
|
||||
|
||||
fh = open(os.path.join(output_dir, pth), "wb")
|
||||
pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)
|
||||
fh.close()
|
||||
|
||||
print("created pickle file: {}".format(pth))
|
||||
|
||||
|
||||
def main():
|
||||
if len(sys.argv) != 3:
|
||||
sys.exit(
|
||||
"Specify output directory and storage type: generate_legacy_"
|
||||
"storage_files.py <output_dir> <storage_type> "
|
||||
)
|
||||
|
||||
output_dir = str(sys.argv[1])
|
||||
storage_type = str(sys.argv[2])
|
||||
|
||||
if storage_type == "pickle":
|
||||
write_legacy_pickles(output_dir=output_dir)
|
||||
else:
|
||||
sys.exit("storage_type must be one of {'pickle'}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,328 @@
|
||||
import os
|
||||
|
||||
from shapely.geometry import (
|
||||
LineString,
|
||||
MultiLineString,
|
||||
MultiPoint,
|
||||
MultiPolygon,
|
||||
Point,
|
||||
Polygon,
|
||||
)
|
||||
|
||||
import geopandas
|
||||
from geopandas import GeoDataFrame
|
||||
|
||||
from .test_file import FIONA_MARK, PYOGRIO_MARK
|
||||
|
||||
import pytest
|
||||
from geopandas.testing import assert_geodataframe_equal
|
||||
|
||||
# Credit: Polygons below come from Montreal city Open Data portal
|
||||
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
|
||||
city_hall_boundaries = Polygon(
|
||||
(
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
(-73.5546126200639, 45.5086813829106),
|
||||
(-73.5540185061397, 45.5084409343852),
|
||||
(-73.5539986525799, 45.5084323044531),
|
||||
(-73.5535801792994, 45.5089539203786),
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
)
|
||||
)
|
||||
vauquelin_place = Polygon(
|
||||
(
|
||||
(-73.5542465586147, 45.5081555487952),
|
||||
(-73.5540185061397, 45.5084409343852),
|
||||
(-73.5546126200639, 45.5086813829106),
|
||||
(-73.5548825850032, 45.5084033554357),
|
||||
(-73.5542465586147, 45.5081555487952),
|
||||
)
|
||||
)
|
||||
|
||||
city_hall_walls = [
|
||||
LineString(
|
||||
(
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
(-73.5546126200639, 45.5086813829106),
|
||||
(-73.5540185061397, 45.5084409343852),
|
||||
)
|
||||
),
|
||||
LineString(
|
||||
(
|
||||
(-73.5539986525799, 45.5084323044531),
|
||||
(-73.5535801792994, 45.5089539203786),
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
)
|
||||
),
|
||||
]
|
||||
|
||||
city_hall_entrance = Point(-73.553785, 45.508722)
|
||||
city_hall_balcony = Point(-73.554138, 45.509080)
|
||||
city_hall_council_chamber = Point(-73.554246, 45.508931)
|
||||
|
||||
point_3D = Point(-73.553785, 45.508722, 300)
|
||||
|
||||
|
||||
# *****************************************
|
||||
# TEST TOOLING
|
||||
|
||||
|
||||
class _ExpectedError:
|
||||
def __init__(self, error_type, error_message_match):
|
||||
self.type = error_type
|
||||
self.match = error_message_match
|
||||
|
||||
|
||||
class _ExpectedErrorBuilder:
|
||||
def __init__(self, composite_key):
|
||||
self.composite_key = composite_key
|
||||
|
||||
def to_raise(self, error_type, error_match):
|
||||
_expected_exceptions[self.composite_key] = _ExpectedError(
|
||||
error_type, error_match
|
||||
)
|
||||
|
||||
|
||||
def _expect_writing(gdf, ogr_driver):
|
||||
return _ExpectedErrorBuilder(_composite_key(gdf, ogr_driver))
|
||||
|
||||
|
||||
def _composite_key(gdf, ogr_driver):
|
||||
return frozenset([id(gdf), ogr_driver])
|
||||
|
||||
|
||||
def _expected_error_on(gdf, ogr_driver):
|
||||
composite_key = _composite_key(gdf, ogr_driver)
|
||||
return _expected_exceptions.get(composite_key, None)
|
||||
|
||||
|
||||
# *****************************************
|
||||
# TEST CASES
|
||||
_geodataframes_to_write = []
|
||||
_expected_exceptions = {}
|
||||
_CRS = "epsg:4326"
|
||||
|
||||
# ------------------
|
||||
# gdf with Points
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]}, crs=_CRS, geometry=[city_hall_entrance, city_hall_balcony]
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with MultiPoints
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]},
|
||||
crs=_CRS,
|
||||
geometry=[
|
||||
MultiPoint([city_hall_balcony, city_hall_council_chamber]),
|
||||
MultiPoint([city_hall_entrance, city_hall_balcony, city_hall_council_chamber]),
|
||||
],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with Points and MultiPoints
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]},
|
||||
crs=_CRS,
|
||||
geometry=[MultiPoint([city_hall_entrance, city_hall_balcony]), city_hall_balcony],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
# 'ESRI Shapefile' driver supports writing LineString/MultiLinestring and
|
||||
# Polygon/MultiPolygon but does not mention Point/MultiPoint
|
||||
# see https://www.gdal.org/drv_shapefile.html
|
||||
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
|
||||
|
||||
# ------------------
|
||||
# gdf with LineStrings
|
||||
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=city_hall_walls)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with MultiLineStrings
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]},
|
||||
crs=_CRS,
|
||||
geometry=[MultiLineString(city_hall_walls), MultiLineString(city_hall_walls)],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with LineStrings and MultiLineStrings
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]},
|
||||
crs=_CRS,
|
||||
geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with Polygons
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]}, crs=_CRS, geometry=[city_hall_boundaries, vauquelin_place]
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with MultiPolygon
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1]},
|
||||
crs=_CRS,
|
||||
geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with Polygon and MultiPolygon
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2]},
|
||||
crs=_CRS,
|
||||
geometry=[
|
||||
MultiPolygon((city_hall_boundaries, vauquelin_place)),
|
||||
city_hall_boundaries,
|
||||
],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with null geometry and Point
|
||||
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, city_hall_entrance])
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with null geometry and 3D Point
|
||||
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, point_3D])
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with null geometries only
|
||||
gdf = GeoDataFrame({"a": [1, 2]}, crs=_CRS, geometry=[None, None])
|
||||
_geodataframes_to_write.append(gdf)
|
||||
|
||||
# ------------------
|
||||
# gdf with all shape types mixed together
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2, 3, 4, 5, 6]},
|
||||
crs=_CRS,
|
||||
geometry=[
|
||||
MultiPolygon((city_hall_boundaries, vauquelin_place)),
|
||||
city_hall_entrance,
|
||||
MultiLineString(city_hall_walls),
|
||||
city_hall_walls[0],
|
||||
MultiPoint([city_hall_entrance, city_hall_balcony]),
|
||||
city_hall_balcony,
|
||||
],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
# Not supported by 'ESRI Shapefile' driver
|
||||
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
|
||||
|
||||
# ------------------
|
||||
# gdf with all 2D shape types and 3D Point mixed together
|
||||
gdf = GeoDataFrame(
|
||||
{"a": [1, 2, 3, 4, 5, 6, 7]},
|
||||
crs=_CRS,
|
||||
geometry=[
|
||||
MultiPolygon((city_hall_boundaries, vauquelin_place)),
|
||||
city_hall_entrance,
|
||||
MultiLineString(city_hall_walls),
|
||||
city_hall_walls[0],
|
||||
MultiPoint([city_hall_entrance, city_hall_balcony]),
|
||||
city_hall_balcony,
|
||||
point_3D,
|
||||
],
|
||||
)
|
||||
_geodataframes_to_write.append(gdf)
|
||||
# Not supported by 'ESRI Shapefile' driver
|
||||
_expect_writing(gdf, "ESRI Shapefile").to_raise(RuntimeError, "Failed to write record")
|
||||
|
||||
|
||||
@pytest.fixture(params=_geodataframes_to_write)
|
||||
def geodataframe(request):
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
("GeoJSON", ".geojson"),
|
||||
("ESRI Shapefile", ".shp"),
|
||||
("GPKG", ".gpkg"),
|
||||
("SQLite", ".sqlite"),
|
||||
]
|
||||
)
|
||||
def ogr_driver(request):
|
||||
return request.param
|
||||
|
||||
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
pytest.param("fiona", marks=FIONA_MARK),
|
||||
pytest.param("pyogrio", marks=PYOGRIO_MARK),
|
||||
]
|
||||
)
|
||||
def engine(request):
|
||||
return request.param
|
||||
|
||||
|
||||
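# Write each sample GeoDataFrame with every driver/engine combination, read it
# back, and check that the roundtripped frame matches; known driver limitations
# are expected to raise instead.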
def test_to_file_roundtrip(tmpdir, geodataframe, ogr_driver, engine):
|
||||
driver, ext = ogr_driver
|
||||
output_file = os.path.join(str(tmpdir), "output_file" + ext)
|
||||
write_kwargs = {}
|
||||
if driver == "SQLite":
|
||||
write_kwargs["spatialite"] = True
|
||||
|
||||
# This if statement can be removed once the minimal fiona version is >= 1.8.20
|
||||
if engine == "fiona":
|
||||
from packaging.version import Version
|
||||
|
||||
import fiona
|
||||
|
||||
if Version(fiona.__version__) < Version("1.8.20"):
|
||||
pytest.skip("SQLite driver only available from version 1.8.20")
|
||||
|
||||
# If only 3D Points are present, geometry_type currently needs to be specified
# explicitly for spatialite. This block can be removed once the following PR
# is released:
|
||||
# https://github.com/geopandas/pyogrio/pull/223
|
||||
if (
|
||||
engine == "pyogrio"
|
||||
and len(geodataframe) == 2
|
||||
and geodataframe.geometry[0] is None
|
||||
and geodataframe.geometry[1] is not None
|
||||
and geodataframe.geometry[1].has_z
|
||||
):
|
||||
write_kwargs["geometry_type"] = "Point Z"
|
||||
|
||||
expected_error = _expected_error_on(geodataframe, driver)
|
||||
if expected_error:
|
||||
with pytest.raises(
|
||||
RuntimeError, match="Failed to write record|Could not add feature to layer"
|
||||
):
|
||||
geodataframe.to_file(
|
||||
output_file, driver=driver, engine=engine, **write_kwargs
|
||||
)
|
||||
else:
|
||||
if driver == "SQLite" and engine == "pyogrio":
|
||||
try:
|
||||
geodataframe.to_file(
|
||||
output_file, driver=driver, engine=engine, **write_kwargs
|
||||
)
|
||||
except ValueError as e:
|
||||
if "unrecognized option 'SPATIALITE'" in str(e):
|
||||
pytest.xfail(
|
||||
"pyogrio wheels from PyPI do not come with SpatiaLite support. "
|
||||
f"Error: {e}"
|
||||
)
|
||||
raise
|
||||
else:
|
||||
geodataframe.to_file(
|
||||
output_file, driver=driver, engine=engine, **write_kwargs
|
||||
)
|
||||
|
||||
reloaded = geopandas.read_file(output_file, engine=engine)
|
||||
|
||||
if driver == "GeoJSON" and engine == "pyogrio":
|
||||
# For GeoJSON files, the int64 column comes back as int32
|
||||
reloaded["a"] = reloaded["a"].astype("int64")
|
||||
|
||||
assert_geodataframe_equal(geodataframe, reloaded, check_column_type="equiv")
|
||||
@@ -0,0 +1,537 @@
|
||||
import contextlib
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
from packaging.version import Version
|
||||
|
||||
import numpy as np
|
||||
|
||||
import shapely
|
||||
from shapely import MultiPoint, Point, box
|
||||
|
||||
from geopandas import GeoDataFrame, GeoSeries
|
||||
|
||||
import pytest
|
||||
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
|
||||
|
||||
pytest.importorskip("pyarrow")
|
||||
import pyarrow as pa
|
||||
import pyarrow.compute as pc
|
||||
from pyarrow import feather
|
||||
|
||||
DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"
|
||||
|
||||
|
||||
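# Helpers to unwrap the Arrow wrapper objects returned by to_arrow(): pyarrow
# >= 14 can consume the Arrow PyCapsule protocol directly via pa.table()/pa.array(),
# while older versions fall back to the private underlying pyarrow objects.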
def pa_table(table):
|
||||
if Version(pa.__version__) < Version("14.0.0"):
|
||||
return table._pa_table
|
||||
else:
|
||||
return pa.table(table)
|
||||
|
||||
|
||||
def pa_array(array):
|
||||
if Version(pa.__version__) < Version("14.0.0"):
|
||||
return array._pa_array
|
||||
else:
|
||||
return pa.array(array)
|
||||
|
||||
|
||||
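# Compare two pyarrow Tables while treating the NaN coordinates used for empty
# geometries as equal, since pyarrow.Table.equals considers NaN != NaN.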
def assert_table_equal(left, right, check_metadata=True):
|
||||
geom_type = left["geometry"].type
|
||||
# in case of Points (directly the inner fixed_size_list or struct type)
|
||||
# -> there are NaNs for empties -> we need to compare them separately
|
||||
# and then fill, because pyarrow.Table.equals considers NaNs as not equal
|
||||
if pa.types.is_fixed_size_list(geom_type):
|
||||
left_values = left["geometry"].chunk(0).values
|
||||
right_values = right["geometry"].chunk(0).values
|
||||
assert pc.is_nan(left_values).equals(pc.is_nan(right_values))
|
||||
left_geoms = pa.FixedSizeListArray.from_arrays(
|
||||
pc.replace_with_mask(left_values, pc.is_nan(left_values), 0.0),
|
||||
type=left["geometry"].type,
|
||||
)
|
||||
right_geoms = pa.FixedSizeListArray.from_arrays(
|
||||
pc.replace_with_mask(right_values, pc.is_nan(right_values), 0.0),
|
||||
type=right["geometry"].type,
|
||||
)
|
||||
left = left.set_column(1, left.schema.field("geometry"), left_geoms)
|
||||
right = right.set_column(1, right.schema.field("geometry"), right_geoms)
|
||||
|
||||
elif pa.types.is_struct(geom_type):
|
||||
left_arr = left["geometry"].chunk(0)
|
||||
right_arr = right["geometry"].chunk(0)
|
||||
|
||||
for i in range(left_arr.type.num_fields):
|
||||
assert pc.is_nan(left_arr.field(i)).equals(pc.is_nan(right_arr.field(i)))
|
||||
|
||||
left_geoms = pa.StructArray.from_arrays(
|
||||
[
|
||||
pc.replace_with_mask(
|
||||
left_arr.field(i), pc.is_nan(left_arr.field(i)), 0.0
|
||||
)
|
||||
for i in range(left_arr.type.num_fields)
|
||||
],
|
||||
fields=list(left["geometry"].type),
|
||||
)
|
||||
right_geoms = pa.StructArray.from_arrays(
|
||||
[
|
||||
pc.replace_with_mask(
|
||||
right_arr.field(i), pc.is_nan(right_arr.field(i)), 0.0
|
||||
)
|
||||
for i in range(right_arr.type.num_fields)
|
||||
],
|
||||
fields=list(right["geometry"].type),
|
||||
)
|
||||
|
||||
left = left.set_column(1, left.schema.field("geometry"), left_geoms)
|
||||
right = right.set_column(1, right.schema.field("geometry"), right_geoms)
|
||||
|
||||
if left.equals(right, check_metadata=check_metadata):
|
||||
return
|
||||
|
||||
if not left.schema.equals(right.schema):
|
||||
raise AssertionError(
|
||||
"Schema not equal\nLeft:\n{0}\nRight:\n{1}".format(
|
||||
left.schema, right.schema
|
||||
)
|
||||
)
|
||||
|
||||
if check_metadata:
|
||||
if not left.schema.equals(right.schema, check_metadata=True):
|
||||
if not left.schema.metadata == right.schema.metadata:
|
||||
raise AssertionError(
|
||||
"Metadata not equal\nLeft:\n{0}\nRight:\n{1}".format(
|
||||
left.schema.metadata, right.schema.metadata
|
||||
)
|
||||
)
|
||||
for col in left.schema.names:
|
||||
assert left.schema.field(col).equals(
|
||||
right.schema.field(col), check_metadata=True
|
||||
)
|
||||
|
||||
for col in left.column_names:
|
||||
a_left = pa.concat_arrays(left.column(col).chunks)
|
||||
a_right = pa.concat_arrays(right.column(col).chunks)
|
||||
if not a_left.equals(a_right):
|
||||
raise AssertionError(
|
||||
"Column '{0}' not equal:\n{1}".format(col, a_left.diff(a_right))
|
||||
)
|
||||
|
||||
raise AssertionError("Tables not equal for unknown reason")
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
shapely.geos_version < (3, 9, 0),
|
||||
reason="Checking for empty is buggy with GEOS<3.9",
|
||||
) # an old GEOS is installed in the CI builds with the defaults channel
|
||||
@pytest.mark.parametrize(
|
||||
"dim",
|
||||
[
|
||||
"xy",
|
||||
pytest.param(
|
||||
"xyz",
|
||||
marks=pytest.mark.skipif(
|
||||
shapely.geos_version < (3, 10, 0),
|
||||
reason="Cannot write 3D geometries with GEOS<3.10",
|
||||
),
|
||||
),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"geometry_type",
|
||||
["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"geometry_encoding, interleaved",
|
||||
[("WKB", None), ("geoarrow", True), ("geoarrow", False)],
|
||||
ids=["WKB", "geoarrow-interleaved", "geoarrow-separated"],
|
||||
)
|
||||
def test_geoarrow_export(geometry_type, dim, geometry_encoding, interleaved):
|
||||
base_path = DATA_PATH / "geoarrow"
|
||||
suffix = geometry_type + ("_z" if dim == "xyz" else "")
|
||||
|
||||
# Read the example data
|
||||
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
|
||||
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
|
||||
df["row_number"] = df["row_number"].astype("int32")
|
||||
df = GeoDataFrame(df)
|
||||
df.geometry.array.crs = None
|
||||
|
||||
# Read the expected data
|
||||
if geometry_encoding == "WKB":
|
||||
filename = f"example-{suffix}-wkb.arrow"
|
||||
else:
|
||||
filename = f"example-{suffix}{'-interleaved' if interleaved else ''}.arrow"
|
||||
expected = feather.read_table(base_path / filename)
|
||||
|
||||
# GeoDataFrame -> Arrow Table
|
||||
result = pa_table(
|
||||
df.to_arrow(geometry_encoding=geometry_encoding, interleaved=interleaved)
|
||||
)
|
||||
# remove the "pandas" metadata
|
||||
result = result.replace_schema_metadata(None)
|
||||
|
||||
mask_nonempty = None
|
||||
if (
|
||||
geometry_encoding == "WKB"
|
||||
and dim == "xyz"
|
||||
and geometry_type.startswith("multi")
|
||||
):
|
||||
# for collections with z dimension, drop the empties because those don't
|
||||
# roundtrip correctly to WKB
|
||||
# (https://github.com/libgeos/geos/issues/888)
|
||||
mask_nonempty = pa.array(np.asarray(~df.geometry.is_empty))
|
||||
result = result.filter(mask_nonempty)
|
||||
expected = expected.filter(mask_nonempty)
|
||||
|
||||
assert_table_equal(result, expected)
|
||||
|
||||
# GeoSeries -> Arrow array
|
||||
if geometry_encoding != "WKB" and geometry_type == "point":
|
||||
# for points, we again have to handle NaNs separately; we already did that
# for the table above, so just skip this part
|
||||
return
|
||||
result_arr = pa_array(
|
||||
df.geometry.to_arrow(
|
||||
geometry_encoding=geometry_encoding, interleaved=interleaved
|
||||
)
|
||||
)
|
||||
if mask_nonempty is not None:
|
||||
result_arr = result_arr.filter(mask_nonempty)
|
||||
assert result_arr.equals(expected["geometry"].chunk(0))
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
Version(shapely.__version__) < Version("2.0.2"),
|
||||
reason="from_ragged_array failing with read-only array input",
|
||||
)
|
||||
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
|
||||
def test_geoarrow_multiple_geometry_crs(encoding):
|
||||
pytest.importorskip("pyproj")
|
||||
# ensure each geometry column has its own crs
|
||||
gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
|
||||
gdf["geom2"] = gdf.geometry.to_crs("epsg:3857")
|
||||
|
||||
result = pa_table(gdf.to_arrow(geometry_encoding=encoding))
|
||||
meta1 = json.loads(
|
||||
result.schema.field("geometry").metadata[b"ARROW:extension:metadata"]
|
||||
)
|
||||
assert json.loads(meta1["crs"])["id"]["code"] == 4326
|
||||
meta2 = json.loads(
|
||||
result.schema.field("geom2").metadata[b"ARROW:extension:metadata"]
|
||||
)
|
||||
assert json.loads(meta2["crs"])["id"]["code"] == 3857
|
||||
|
||||
roundtripped = GeoDataFrame.from_arrow(result)
|
||||
assert_geodataframe_equal(gdf, roundtripped)
|
||||
assert gdf.geometry.crs == "epsg:4326"
|
||||
assert gdf.geom2.crs == "epsg:3857"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
|
||||
def test_geoarrow_series_name_crs(encoding):
|
||||
pytest.importorskip("pyproj")
|
||||
pytest.importorskip("pyarrow", minversion="14.0.0")
|
||||
|
||||
gser = GeoSeries([box(0, 0, 10, 10)], crs="epsg:4326", name="geom")
|
||||
schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
|
||||
field = pa.Field._import_from_c_capsule(schema_capsule)
|
||||
assert field.name == "geom"
|
||||
assert (
|
||||
field.metadata[b"ARROW:extension:name"] == b"geoarrow.wkb"
|
||||
if encoding == "WKB"
|
||||
else b"geoarrow.polygon"
|
||||
)
|
||||
meta = json.loads(field.metadata[b"ARROW:extension:metadata"])
|
||||
assert json.loads(meta["crs"])["id"]["code"] == 4326
|
||||
|
||||
# ensure it also works without a name
|
||||
gser = GeoSeries([box(0, 0, 10, 10)])
|
||||
schema_capsule, _ = gser.to_arrow(geometry_encoding=encoding).__arrow_c_array__()
|
||||
field = pa.Field._import_from_c_capsule(schema_capsule)
|
||||
assert field.name == ""
|
||||
|
||||
|
||||
def test_geoarrow_unsupported_encoding():
|
||||
gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)], crs="epsg:4326")
|
||||
|
||||
with pytest.raises(ValueError, match="Expected geometry encoding"):
|
||||
gdf.to_arrow(geometry_encoding="invalid")
|
||||
|
||||
with pytest.raises(ValueError, match="Expected geometry encoding"):
|
||||
gdf.geometry.to_arrow(geometry_encoding="invalid")
|
||||
|
||||
|
||||
def test_geoarrow_mixed_geometry_types():
|
||||
gdf = GeoDataFrame(
|
||||
{"geometry": [Point(0, 0), box(0, 0, 10, 10)]},
|
||||
crs="epsg:4326",
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match="Geometry type combination is not supported"):
|
||||
gdf.to_arrow(geometry_encoding="geoarrow")
|
||||
|
||||
gdf = GeoDataFrame(
|
||||
{"geometry": [Point(0, 0), MultiPoint([(0, 0), (1, 1)])]},
|
||||
crs="epsg:4326",
|
||||
)
|
||||
result = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
|
||||
assert (
|
||||
result.schema.field("geometry").metadata[b"ARROW:extension:name"]
|
||||
== b"geoarrow.multipoint"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("geom_type", ["point", "polygon"])
|
||||
@pytest.mark.parametrize(
|
||||
"encoding, interleaved", [("WKB", True), ("geoarrow", True), ("geoarrow", False)]
|
||||
)
|
||||
def test_geoarrow_missing(encoding, interleaved, geom_type):
|
||||
# dummy test for single geometry type until missing values are included
|
||||
# in the test data for test_geoarrow_export
|
||||
gdf = GeoDataFrame(
|
||||
geometry=[Point(0, 0) if geom_type == "point" else box(0, 0, 10, 10), None],
|
||||
crs="epsg:4326",
|
||||
)
|
||||
if (
|
||||
encoding == "geoarrow"
|
||||
and geom_type == "point"
|
||||
and interleaved
|
||||
and Version(pa.__version__) < Version("15.0.0")
|
||||
):
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
match="Converting point geometries with missing values is not supported",
|
||||
):
|
||||
gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved)
|
||||
return
|
||||
result = pa_table(gdf.to_arrow(geometry_encoding=encoding, interleaved=interleaved))
|
||||
assert result["geometry"].null_count == 1
|
||||
assert result["geometry"].is_null().to_pylist() == [False, True]
|
||||
|
||||
|
||||
def test_geoarrow_include_z():
|
||||
gdf = GeoDataFrame({"geometry": [Point(0, 0), Point(1, 1), Point()]})
|
||||
|
||||
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
|
||||
assert table["geometry"].type.value_field.name == "xy"
|
||||
assert table["geometry"].type.list_size == 2
|
||||
|
||||
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow", include_z=True))
|
||||
assert table["geometry"].type.value_field.name == "xyz"
|
||||
assert table["geometry"].type.list_size == 3
|
||||
assert np.isnan(table["geometry"].chunk(0).values.to_numpy()[2::3]).all()
|
||||
|
||||
gdf = GeoDataFrame({"geometry": [Point(0, 0, 0), Point(1, 1, 1), Point()]})
|
||||
|
||||
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow"))
|
||||
assert table["geometry"].type.value_field.name == "xyz"
|
||||
assert table["geometry"].type.list_size == 3
|
||||
|
||||
table = pa_table(gdf.to_arrow(geometry_encoding="geoarrow", include_z=False))
|
||||
assert table["geometry"].type.value_field.name == "xy"
|
||||
assert table["geometry"].type.list_size == 2
|
||||
|
||||
|
||||
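# Temporarily register the geoarrow.pyarrow extension types and always
# unregister them afterwards so the global pyarrow registry is not leaked
# into other tests.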
@contextlib.contextmanager
|
||||
def with_geoarrow_extension_types():
|
||||
gp = pytest.importorskip("geoarrow.pyarrow")
|
||||
gp.register_extension_types()
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
gp.unregister_extension_types()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dim", ["xy", "xyz"])
|
||||
@pytest.mark.parametrize(
|
||||
"geometry_type",
|
||||
["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
|
||||
)
|
||||
def test_geoarrow_export_with_extension_types(geometry_type, dim):
|
||||
# ensure the exported data can be imported by geoarrow-pyarrow and is
# recognized as extension types
|
||||
base_path = DATA_PATH / "geoarrow"
|
||||
suffix = geometry_type + ("_z" if dim == "xyz" else "")
|
||||
|
||||
# Read the example data
|
||||
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
|
||||
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
|
||||
df["row_number"] = df["row_number"].astype("int32")
|
||||
df = GeoDataFrame(df)
|
||||
df.geometry.array.crs = None
|
||||
|
||||
pytest.importorskip("geoarrow.pyarrow")
|
||||
|
||||
with with_geoarrow_extension_types():
|
||||
result1 = pa_table(df.to_arrow(geometry_encoding="WKB"))
|
||||
assert isinstance(result1["geometry"].type, pa.ExtensionType)
|
||||
|
||||
result2 = pa_table(df.to_arrow(geometry_encoding="geoarrow"))
|
||||
assert isinstance(result2["geometry"].type, pa.ExtensionType)
|
||||
|
||||
result3 = pa_table(df.to_arrow(geometry_encoding="geoarrow", interleaved=False))
|
||||
assert isinstance(result3["geometry"].type, pa.ExtensionType)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
Version(shapely.__version__) < Version("2.0.2"),
|
||||
reason="from_ragged_array failing with read-only array input",
|
||||
)
|
||||
@pytest.mark.parametrize("dim", ["xy", "xyz"])
|
||||
@pytest.mark.parametrize(
|
||||
"geometry_type",
|
||||
[
|
||||
"point",
|
||||
"linestring",
|
||||
"polygon",
|
||||
"multipoint",
|
||||
"multilinestring",
|
||||
"multipolygon",
|
||||
],
|
||||
)
|
||||
def test_geoarrow_import(geometry_type, dim):
|
||||
base_path = DATA_PATH / "geoarrow"
|
||||
suffix = geometry_type + ("_z" if dim == "xyz" else "")
|
||||
|
||||
# Read the example data
|
||||
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
|
||||
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
|
||||
df = GeoDataFrame(df)
|
||||
df.geometry.crs = None
|
||||
|
||||
table1 = feather.read_table(base_path / f"example-{suffix}-wkb.arrow")
|
||||
result1 = GeoDataFrame.from_arrow(table1)
|
||||
assert_geodataframe_equal(result1, df)
|
||||
|
||||
table2 = feather.read_table(base_path / f"example-{suffix}-interleaved.arrow")
|
||||
result2 = GeoDataFrame.from_arrow(table2)
|
||||
assert_geodataframe_equal(result2, df)
|
||||
|
||||
table3 = feather.read_table(base_path / f"example-{suffix}.arrow")
|
||||
result3 = GeoDataFrame.from_arrow(table3)
|
||||
assert_geodataframe_equal(result3, df)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
Version(shapely.__version__) < Version("2.0.2"),
|
||||
reason="from_ragged_array failing with read-only array input",
|
||||
)
|
||||
@pytest.mark.parametrize("encoding", ["WKB", "geoarrow"])
|
||||
def test_geoarrow_import_geometry_column(encoding):
|
||||
pytest.importorskip("pyproj")
|
||||
# ensure a table with multiple geometry columns can be imported and the
# active geometry column selected
|
||||
gdf = GeoDataFrame(geometry=[box(0, 0, 10, 10)])
|
||||
gdf["centroid"] = gdf.geometry.centroid
|
||||
|
||||
result = GeoDataFrame.from_arrow(pa_table(gdf.to_arrow(geometry_encoding=encoding)))
|
||||
assert_geodataframe_equal(result, gdf)
|
||||
assert result.active_geometry_name == "geometry"
|
||||
|
||||
result = GeoDataFrame.from_arrow(
|
||||
pa_table(gdf[["centroid"]].to_arrow(geometry_encoding=encoding))
|
||||
)
|
||||
assert result.active_geometry_name == "centroid"
|
||||
|
||||
result = GeoDataFrame.from_arrow(
|
||||
pa_table(gdf.to_arrow(geometry_encoding=encoding)), geometry="centroid"
|
||||
)
|
||||
assert result.active_geometry_name == "centroid"
|
||||
assert_geodataframe_equal(result, gdf.set_geometry("centroid"))
|
||||
|
||||
|
||||
def test_geoarrow_import_missing_geometry():
|
||||
pytest.importorskip("pyarrow", minversion="14.0.0")
|
||||
|
||||
table = pa.table({"a": [0, 1, 2], "b": [0.1, 0.2, 0.3]})
|
||||
with pytest.raises(ValueError, match="No geometry column found"):
|
||||
GeoDataFrame.from_arrow(table)
|
||||
|
||||
with pytest.raises(ValueError, match="No GeoArrow geometry field found"):
|
||||
GeoSeries.from_arrow(table["a"].chunk(0))
|
||||
|
||||
|
||||
def test_geoarrow_import_capsule_interface():
|
||||
# ensure we can import a non-pyarrow object (via the Arrow PyCapsule protocol)
|
||||
pytest.importorskip("pyarrow", minversion="14.0.0")
|
||||
gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
|
||||
|
||||
result = GeoDataFrame.from_arrow(gdf.to_arrow())
|
||||
assert_geodataframe_equal(result, gdf)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dim", ["xy", "xyz"])
|
||||
@pytest.mark.parametrize(
|
||||
"geometry_type",
|
||||
["point", "linestring", "polygon", "multipoint", "multilinestring", "multipolygon"],
|
||||
)
|
||||
def test_geoarrow_import_from_extension_types(geometry_type, dim):
|
||||
# ensure data stored with geoarrow-pyarrow extension types can be imported
# back into a GeoDataFrame
|
||||
pytest.importorskip("pyproj")
|
||||
base_path = DATA_PATH / "geoarrow"
|
||||
suffix = geometry_type + ("_z" if dim == "xyz" else "")
|
||||
|
||||
# Read the example data
|
||||
df = feather.read_feather(base_path / f"example-{suffix}-wkb.arrow")
|
||||
df["geometry"] = GeoSeries.from_wkb(df["geometry"])
|
||||
df = GeoDataFrame(df, crs="EPSG:3857")
|
||||
|
||||
pytest.importorskip("geoarrow.pyarrow")
|
||||
|
||||
with with_geoarrow_extension_types():
|
||||
result1 = GeoDataFrame.from_arrow(
|
||||
pa_table(df.to_arrow(geometry_encoding="WKB"))
|
||||
)
|
||||
assert_geodataframe_equal(result1, df)
|
||||
|
||||
result2 = GeoDataFrame.from_arrow(
|
||||
pa_table(df.to_arrow(geometry_encoding="geoarrow"))
|
||||
)
|
||||
assert_geodataframe_equal(result2, df)
|
||||
|
||||
result3 = GeoDataFrame.from_arrow(
|
||||
pa_table(df.to_arrow(geometry_encoding="geoarrow", interleaved=False))
|
||||
)
|
||||
assert_geodataframe_equal(result3, df)
|
||||
|
||||
|
||||
def test_geoarrow_import_geoseries():
|
||||
pytest.importorskip("pyproj")
|
||||
gp = pytest.importorskip("geoarrow.pyarrow")
|
||||
ser = GeoSeries.from_wkt(["POINT (1 1)", "POINT (2 2)"], crs="EPSG:3857")
|
||||
|
||||
with with_geoarrow_extension_types():
|
||||
arr = gp.array(ser.to_arrow(geometry_encoding="WKB"))
|
||||
result = GeoSeries.from_arrow(arr)
|
||||
assert_geoseries_equal(result, ser)
|
||||
|
||||
arr = gp.array(ser.to_arrow(geometry_encoding="geoarrow"))
|
||||
result = GeoSeries.from_arrow(arr)
|
||||
assert_geoseries_equal(result, ser)
|
||||
|
||||
# the name is lost when going through a pyarrow.Array
|
||||
ser.name = "name"
|
||||
arr = gp.array(ser.to_arrow())
|
||||
result = GeoSeries.from_arrow(arr)
|
||||
assert result.name is None
|
||||
# we can specify the name as one of the kwargs
|
||||
result = GeoSeries.from_arrow(arr, name="test")
|
||||
assert_geoseries_equal(result, ser)
|
||||
|
||||
|
||||
def test_geoarrow_import_unknown_geoarrow_type():
|
||||
gdf = GeoDataFrame({"col": [1]}, geometry=[box(0, 0, 10, 10)])
|
||||
table = pa_table(gdf.to_arrow())
|
||||
schema = table.schema
|
||||
new_field = schema.field("geometry").with_metadata(
|
||||
{
|
||||
b"ARROW:extension:name": b"geoarrow.unknown",
|
||||
b"ARROW:extension:metadata": b"{}",
|
||||
}
|
||||
)
|
||||
|
||||
new_schema = pa.schema([schema.field(0), new_field])
|
||||
new_table = table.cast(new_schema)
|
||||
|
||||
with pytest.raises(TypeError, match="Unknown GeoArrow extension type"):
|
||||
GeoDataFrame.from_arrow(new_table)
|
||||
@@ -0,0 +1,306 @@
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from shapely.geometry import (
|
||||
LineString,
|
||||
MultiLineString,
|
||||
MultiPoint,
|
||||
MultiPolygon,
|
||||
Point,
|
||||
Polygon,
|
||||
)
|
||||
|
||||
from geopandas import GeoDataFrame
|
||||
from geopandas.io.file import infer_schema
|
||||
|
||||
import pytest
|
||||
|
||||
# Credit: Polygons below come from Montreal city Open Data portal
|
||||
# http://donnees.ville.montreal.qc.ca/dataset/unites-evaluation-fonciere
|
||||
city_hall_boundaries = Polygon(
|
||||
(
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
(-73.5546126200639, 45.5086813829106),
|
||||
(-73.5540185061397, 45.5084409343852),
|
||||
(-73.5539986525799, 45.5084323044531),
|
||||
(-73.5535801792994, 45.5089539203786),
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
)
|
||||
)
|
||||
vauquelin_place = Polygon(
|
||||
(
|
||||
(-73.5542465586147, 45.5081555487952),
|
||||
(-73.5540185061397, 45.5084409343852),
|
||||
(-73.5546126200639, 45.5086813829106),
|
||||
(-73.5548825850032, 45.5084033554357),
|
||||
(-73.5542465586147, 45.5081555487952),
|
||||
)
|
||||
)
|
||||
|
||||
city_hall_walls = [
|
||||
LineString(
|
||||
(
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
(-73.5546126200639, 45.5086813829106),
|
||||
(-73.5540185061397, 45.5084409343852),
|
||||
)
|
||||
),
|
||||
LineString(
|
||||
(
|
||||
(-73.5539986525799, 45.5084323044531),
|
||||
(-73.5535801792994, 45.5089539203786),
|
||||
(-73.5541107525234, 45.5091983609661),
|
||||
)
|
||||
),
|
||||
]
|
||||
|
||||
city_hall_entrance = Point(-73.553785, 45.508722)
|
||||
city_hall_balcony = Point(-73.554138, 45.509080)
|
||||
city_hall_council_chamber = Point(-73.554246, 45.508931)
|
||||
|
||||
point_3D = Point(-73.553785, 45.508722, 300)
|
||||
linestring_3D = LineString(
|
||||
(
|
||||
(-73.5541107525234, 45.5091983609661, 300),
|
||||
(-73.5546126200639, 45.5086813829106, 300),
|
||||
(-73.5540185061397, 45.5084409343852, 300),
|
||||
)
|
||||
)
|
||||
polygon_3D = Polygon(
|
||||
(
|
||||
(-73.5541107525234, 45.5091983609661, 300),
|
||||
(-73.5535801792994, 45.5089539203786, 300),
|
||||
(-73.5541107525234, 45.5091983609661, 300),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def test_infer_schema_only_points():
|
||||
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
|
||||
|
||||
assert infer_schema(df) == {"geometry": "Point", "properties": OrderedDict()}
|
||||
|
||||
|
||||
def test_infer_schema_points_and_multipoints():
|
||||
df = GeoDataFrame(
|
||||
geometry=[
|
||||
MultiPoint([city_hall_entrance, city_hall_balcony]),
|
||||
city_hall_balcony,
|
||||
]
|
||||
)
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": ["MultiPoint", "Point"],
|
||||
"properties": OrderedDict(),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_only_multipoints():
|
||||
df = GeoDataFrame(
|
||||
geometry=[
|
||||
MultiPoint(
|
||||
[city_hall_entrance, city_hall_balcony, city_hall_council_chamber]
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
assert infer_schema(df) == {"geometry": "MultiPoint", "properties": OrderedDict()}
|
||||
|
||||
|
||||
def test_infer_schema_only_linestrings():
|
||||
df = GeoDataFrame(geometry=city_hall_walls)
|
||||
|
||||
assert infer_schema(df) == {"geometry": "LineString", "properties": OrderedDict()}
|
||||
|
||||
|
||||
def test_infer_schema_linestrings_and_multilinestrings():
|
||||
df = GeoDataFrame(geometry=[MultiLineString(city_hall_walls), city_hall_walls[0]])
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": ["MultiLineString", "LineString"],
|
||||
"properties": OrderedDict(),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_only_multilinestrings():
|
||||
df = GeoDataFrame(geometry=[MultiLineString(city_hall_walls)])
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": "MultiLineString",
|
||||
"properties": OrderedDict(),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_only_polygons():
|
||||
df = GeoDataFrame(geometry=[city_hall_boundaries, vauquelin_place])
|
||||
|
||||
assert infer_schema(df) == {"geometry": "Polygon", "properties": OrderedDict()}
|
||||
|
||||
|
||||
def test_infer_schema_polygons_and_multipolygons():
|
||||
df = GeoDataFrame(
|
||||
geometry=[
|
||||
MultiPolygon((city_hall_boundaries, vauquelin_place)),
|
||||
city_hall_boundaries,
|
||||
]
|
||||
)
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": ["MultiPolygon", "Polygon"],
|
||||
"properties": OrderedDict(),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_only_multipolygons():
|
||||
df = GeoDataFrame(geometry=[MultiPolygon((city_hall_boundaries, vauquelin_place))])
|
||||
|
||||
assert infer_schema(df) == {"geometry": "MultiPolygon", "properties": OrderedDict()}
|
||||
|
||||
|
||||
def test_infer_schema_multiple_shape_types():
|
||||
df = GeoDataFrame(
|
||||
geometry=[
|
||||
MultiPolygon((city_hall_boundaries, vauquelin_place)),
|
||||
city_hall_boundaries,
|
||||
MultiLineString(city_hall_walls),
|
||||
city_hall_walls[0],
|
||||
MultiPoint([city_hall_entrance, city_hall_balcony]),
|
||||
city_hall_balcony,
|
||||
]
|
||||
)
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": [
|
||||
"MultiPolygon",
|
||||
"Polygon",
|
||||
"MultiLineString",
|
||||
"LineString",
|
||||
"MultiPoint",
|
||||
"Point",
|
||||
],
|
||||
"properties": OrderedDict(),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_mixed_3D_shape_type():
|
||||
df = GeoDataFrame(
|
||||
geometry=[
|
||||
MultiPolygon((city_hall_boundaries, vauquelin_place)),
|
||||
city_hall_boundaries,
|
||||
MultiLineString(city_hall_walls),
|
||||
city_hall_walls[0],
|
||||
MultiPoint([city_hall_entrance, city_hall_balcony]),
|
||||
city_hall_balcony,
|
||||
point_3D,
|
||||
]
|
||||
)
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": [
|
||||
"3D Point",
|
||||
"MultiPolygon",
|
||||
"Polygon",
|
||||
"MultiLineString",
|
||||
"LineString",
|
||||
"MultiPoint",
|
||||
"Point",
|
||||
],
|
||||
"properties": OrderedDict(),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_mixed_3D_Point():
|
||||
df = GeoDataFrame(geometry=[city_hall_balcony, point_3D])
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": ["3D Point", "Point"],
|
||||
"properties": OrderedDict(),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_only_3D_Points():
|
||||
df = GeoDataFrame(geometry=[point_3D, point_3D])
|
||||
|
||||
assert infer_schema(df) == {"geometry": "3D Point", "properties": OrderedDict()}
|
||||
|
||||
|
||||
def test_infer_schema_mixed_3D_linestring():
|
||||
df = GeoDataFrame(geometry=[city_hall_walls[0], linestring_3D])
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": ["3D LineString", "LineString"],
|
||||
"properties": OrderedDict(),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_only_3D_linestrings():
|
||||
df = GeoDataFrame(geometry=[linestring_3D, linestring_3D])
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": "3D LineString",
|
||||
"properties": OrderedDict(),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_mixed_3D_Polygon():
|
||||
df = GeoDataFrame(geometry=[city_hall_boundaries, polygon_3D])
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": ["3D Polygon", "Polygon"],
|
||||
"properties": OrderedDict(),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_only_3D_Polygons():
|
||||
df = GeoDataFrame(geometry=[polygon_3D, polygon_3D])
|
||||
|
||||
assert infer_schema(df) == {"geometry": "3D Polygon", "properties": OrderedDict()}
|
||||
|
||||
|
||||
def test_infer_schema_null_geometry_and_2D_point():
|
||||
df = GeoDataFrame(geometry=[None, city_hall_entrance])
|
||||
|
||||
# None geometry type is then omitted
|
||||
assert infer_schema(df) == {"geometry": "Point", "properties": OrderedDict()}
|
||||
|
||||
|
||||
def test_infer_schema_null_geometry_and_3D_point():
|
||||
df = GeoDataFrame(geometry=[None, point_3D])
|
||||
|
||||
# None geometry type is then omitted
|
||||
assert infer_schema(df) == {"geometry": "3D Point", "properties": OrderedDict()}
|
||||
|
||||
|
||||
def test_infer_schema_null_geometry_all():
|
||||
df = GeoDataFrame(geometry=[None, None])
|
||||
|
||||
# None geometry type is then replaced by 'Unknown'
|
||||
# (default geometry type supported by Fiona)
|
||||
assert infer_schema(df) == {"geometry": "Unknown", "properties": OrderedDict()}
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"array_data,dtype", [([1, 2**31 - 1], np.int32), ([1, np.nan], pd.Int32Dtype())]
|
||||
)
|
||||
def test_infer_schema_int32(array_data, dtype):
|
||||
int32col = pd.array(data=array_data, dtype=dtype)
|
||||
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
|
||||
df["int32_column"] = int32col
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": "Point",
|
||||
"properties": OrderedDict([("int32_column", "int32")]),
|
||||
}
|
||||
|
||||
|
||||
def test_infer_schema_int64():
|
||||
int64col = pd.array([1, np.nan], dtype=pd.Int64Dtype())
|
||||
df = GeoDataFrame(geometry=[city_hall_entrance, city_hall_balcony])
|
||||
df["int64_column"] = int64col
|
||||
|
||||
assert infer_schema(df) == {
|
||||
"geometry": "Point",
|
||||
"properties": OrderedDict([("int64_column", "int")]),
|
||||
}
|
||||
@@ -0,0 +1,56 @@
"""
See generate_legacy_storage_files.py for the creation of the legacy files.

"""

import glob
import os
import pathlib

import pandas as pd

import pytest
from geopandas.testing import assert_geodataframe_equal

DATA_PATH = pathlib.Path(os.path.dirname(__file__)) / "data"


@pytest.fixture(scope="module")
def current_pickle_data():
    # our current version pickle data
    from .generate_legacy_storage_files import create_pickle_data

    return create_pickle_data()


files = glob.glob(str(DATA_PATH / "pickle" / "*.pickle"))


@pytest.fixture(params=files, ids=[p.split("/")[-1] for p in files])
def legacy_pickle(request):
    return request.param


@pytest.mark.skip(
    reason=(
        "shapely 2.0/pygeos-based unpickling currently only works for "
        "shapely-2.0/pygeos-written files"
    ),
)
def test_legacy_pickles(current_pickle_data, legacy_pickle):
    result = pd.read_pickle(legacy_pickle)

    for name, value in result.items():
        expected = current_pickle_data[name]
        assert_geodataframe_equal(value, expected)


def test_round_trip_current(tmpdir, current_pickle_data):
    data = current_pickle_data

    for name, value in data.items():
        path = str(tmpdir / "{}.pickle".format(name))
        value.to_pickle(path)
        result = pd.read_pickle(path)
        assert_geodataframe_equal(result, value)
        assert isinstance(result.has_sindex, bool)
@@ -0,0 +1,878 @@
|
||||
"""
|
||||
Tests here include reading/writing to different types of spatial databases.
|
||||
The spatial database tests may not work without additional system
|
||||
configuration. PostGIS tests require a test database to have been set up;
|
||||
see geopandas.tests.util for more information.
|
||||
"""
|
||||
|
||||
import os
|
||||
import warnings
|
||||
from importlib.util import find_spec
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import geopandas
|
||||
import geopandas._compat as compat
|
||||
from geopandas import GeoDataFrame, read_file, read_postgis
|
||||
from geopandas._compat import HAS_PYPROJ
|
||||
from geopandas.io.sql import _get_conn as get_conn
|
||||
from geopandas.io.sql import _write_postgis as write_postgis
|
||||
|
||||
import pytest
|
||||
from geopandas.tests.util import (
|
||||
create_postgis,
|
||||
create_spatialite,
|
||||
mock,
|
||||
validate_boro_df,
|
||||
)
|
||||
|
||||
try:
|
||||
from sqlalchemy import text
|
||||
except ImportError:
|
||||
# Avoid local imports for text in all sqlalchemy tests
|
||||
# all tests using text use engine_postgis, which ensures sqlalchemy is available
|
||||
text = str
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_nybb(nybb_filename):
|
||||
df = read_file(nybb_filename)
|
||||
return df
|
||||
|
||||
|
||||
def check_available_postgis_drivers() -> list[str]:
|
||||
"""Work out which of psycopg2 and psycopg are available.
|
||||
This prevents tests running if the relevant package isn't installed
|
||||
(rather than being skipped, as skips are treated as failures during postgis CI)
|
||||
"""
|
||||
drivers = []
|
||||
if find_spec("psycopg"):
|
||||
drivers.append("psycopg")
|
||||
if find_spec("psycopg2"):
|
||||
drivers.append("psycopg2")
|
||||
return drivers
|
||||
|
||||
|
||||
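# evaluated at import time so the available drivers can parametrize the
# connection/engine fixtures below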
POSTGIS_DRIVERS = check_available_postgis_drivers()
|
||||
|
||||
|
||||
def prepare_database_credentials() -> dict:
|
||||
"""Gather postgres connection credentials from environment variables."""
|
||||
return {
|
||||
"dbname": "test_geopandas",
|
||||
"user": os.environ.get("PGUSER"),
|
||||
"password": os.environ.get("PGPASSWORD"),
|
||||
"host": os.environ.get("PGHOST"),
|
||||
"port": os.environ.get("PGPORT"),
|
||||
}
|
||||
|
||||
|
||||
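# The connection fixtures below read their credentials from the standard libpq
# environment variables via prepare_database_credentials(). A typical local
# setup (hypothetical values) might look like:
#   createdb test_geopandas
#   psql test_geopandas -c "CREATE EXTENSION postgis;"
#   export PGUSER=postgres PGPASSWORD=postgres PGHOST=localhost PGPORT=5432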
@pytest.fixture()
|
||||
def connection_postgis(request):
|
||||
"""Create a postgres connection using either psycopg2 or psycopg.
|
||||
|
||||
Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS."""
|
||||
psycopg = pytest.importorskip(request.param)
|
||||
|
||||
try:
|
||||
con = psycopg.connect(**prepare_database_credentials())
|
||||
except psycopg.OperationalError:
|
||||
pytest.skip("Cannot connect with postgresql database")
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings(
|
||||
"ignore", message="pandas only supports SQLAlchemy connectable.*"
|
||||
)
|
||||
yield con
|
||||
con.close()
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def engine_postgis(request):
|
||||
"""
|
||||
Initiate a sqlalchemy connection engine using either psycopg2 or psycopg.
|
||||
|
||||
Use this as an indirect fixture, where the request parameter is POSTGIS_DRIVERS.
|
||||
"""
|
||||
sqlalchemy = pytest.importorskip("sqlalchemy")
|
||||
from sqlalchemy.engine.url import URL
|
||||
|
||||
credentials = prepare_database_credentials()
|
||||
try:
|
||||
con = sqlalchemy.create_engine(
|
||||
URL.create(
|
||||
drivername=f"postgresql+{request.param}",
|
||||
username=credentials["user"],
|
||||
database=credentials["dbname"],
|
||||
password=credentials["password"],
|
||||
host=credentials["host"],
|
||||
port=credentials["port"],
|
||||
)
|
||||
)
|
||||
con.connect()
|
||||
except Exception:
|
||||
pytest.skip("Cannot connect with postgresql database")
|
||||
|
||||
yield con
|
||||
con.dispose()
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def connection_spatialite():
|
||||
"""
|
||||
Return a memory-based SQLite3 connection with SpatiaLite enabled & initialized.
|
||||
|
||||
`The sqlite3 module must be built with loadable extension support
|
||||
<https://docs.python.org/3/library/sqlite3.html#f1>`_ and
|
||||
`SpatiaLite <https://www.gaia-gis.it/fossil/libspatialite/index>`_
|
||||
must be available on the system as a SQLite module.
|
||||
Packages available on Anaconda meet requirements.
|
||||
|
||||
Exceptions
|
||||
----------
|
||||
``AttributeError`` on missing support for loadable SQLite extensions
|
||||
``sqlite3.OperationalError`` on missing SpatiaLite
|
||||
"""
|
||||
sqlite3 = pytest.importorskip("sqlite3")
|
||||
try:
|
||||
with sqlite3.connect(":memory:") as con:
|
||||
con.enable_load_extension(True)
|
||||
con.load_extension("mod_spatialite")
|
||||
con.execute("SELECT InitSpatialMetaData(TRUE)")
|
||||
except Exception:
|
||||
con.close()
|
||||
pytest.skip("Cannot setup spatialite database")
|
||||
|
||||
yield con
|
||||
con.close()
|
||||
|
||||
|
||||
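# Drop the given table via SQLAlchemy reflection if it exists, silencing the
# "Did not recognize type 'geometry'" warning raised for PostGIS columns.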
def drop_table_if_exists(conn_or_engine, table):
|
||||
sqlalchemy = pytest.importorskip("sqlalchemy")
|
||||
|
||||
if sqlalchemy.inspect(conn_or_engine).has_table(table):
|
||||
metadata = sqlalchemy.MetaData()
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings(
|
||||
"ignore", message="Did not recognize type 'geometry' of column.*"
|
||||
)
|
||||
metadata.reflect(conn_or_engine)
|
||||
table = metadata.tables.get(table)
|
||||
if table is not None:
|
||||
table.drop(conn_or_engine, checkfirst=True)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_mixed_single_and_multi():
|
||||
from shapely.geometry import LineString, MultiLineString, Point
|
||||
|
||||
df = geopandas.GeoDataFrame(
|
||||
{
|
||||
"geometry": [
|
||||
LineString([(0, 0), (1, 1)]),
|
||||
MultiLineString([[(0, 0), (1, 1)], [(2, 2), (3, 3)]]),
|
||||
Point(0, 1),
|
||||
]
|
||||
},
|
||||
crs="epsg:4326",
|
||||
)
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_geom_collection():
|
||||
from shapely.geometry import GeometryCollection, LineString, Point, Polygon
|
||||
|
||||
df = geopandas.GeoDataFrame(
|
||||
{
|
||||
"geometry": [
|
||||
GeometryCollection(
|
||||
[
|
||||
Polygon([(0, 0), (1, 1), (0, 1)]),
|
||||
LineString([(0, 0), (1, 1)]),
|
||||
Point(0, 0),
|
||||
]
|
||||
)
|
||||
]
|
||||
},
|
||||
crs="epsg:4326",
|
||||
)
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_linear_ring():
|
||||
from shapely.geometry import LinearRing
|
||||
|
||||
df = geopandas.GeoDataFrame(
|
||||
{"geometry": [LinearRing(((0, 0), (0, 1), (1, 1), (1, 0)))]}, crs="epsg:4326"
|
||||
)
|
||||
return df
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def df_3D_geoms():
|
||||
from shapely.geometry import LineString, Point, Polygon
|
||||
|
||||
df = geopandas.GeoDataFrame(
|
||||
{
|
||||
"geometry": [
|
||||
LineString([(0, 0, 0), (1, 1, 1)]),
|
||||
Polygon([(0, 0, 0), (1, 1, 1), (0, 1, 1)]),
|
||||
Point(0, 1, 2),
|
||||
]
|
||||
},
|
||||
crs="epsg:4326",
|
||||
)
|
||||
return df
|
||||
|
||||
|
||||
class TestIO:
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_get_conn(self, engine_postgis):
|
||||
Connection = pytest.importorskip("sqlalchemy.engine.base").Connection
|
||||
|
||||
engine = engine_postgis
|
||||
with get_conn(engine) as output:
|
||||
assert isinstance(output, Connection)
|
||||
with engine.connect() as conn:
|
||||
with get_conn(conn) as output:
|
||||
assert isinstance(output, Connection)
|
||||
with pytest.raises(ValueError):
|
||||
with get_conn(object()):
|
||||
pass
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_default(self, connection_postgis, df_nybb):
|
||||
con = connection_postgis
|
||||
create_postgis(con, df_nybb)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = read_postgis(sql, con)
|
||||
|
||||
validate_boro_df(df)
|
||||
# no crs defined on the created geodatabase, and none specified by the user;
# the crs should not be set to 0, as a get_srid failure would do
|
||||
assert df.crs is None
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_custom_geom_col(self, connection_postgis, df_nybb):
|
||||
con = connection_postgis
|
||||
geom_col = "the_geom"
|
||||
create_postgis(con, df_nybb, geom_col=geom_col)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = read_postgis(sql, con, geom_col=geom_col)
|
||||
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_select_geom_as(self, connection_postgis, df_nybb):
|
||||
"""Tests that a SELECT {geom} AS {some_other_geom} works."""
|
||||
con = connection_postgis
|
||||
orig_geom = "geom"
|
||||
out_geom = "the_geom"
|
||||
create_postgis(con, df_nybb, geom_col=orig_geom)
|
||||
|
||||
sql = """SELECT borocode, boroname, shape_leng, shape_area,
|
||||
{} as {} FROM nybb;""".format(
|
||||
orig_geom, out_geom
|
||||
)
|
||||
df = read_postgis(sql, con, geom_col=out_geom)
|
||||
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_get_srid(self, connection_postgis, df_nybb):
|
||||
"""Tests that an SRID can be read from a geodatabase (GH #451)."""
|
||||
con = connection_postgis
|
||||
crs = "epsg:4269"
|
||||
df_reproj = df_nybb.to_crs(crs)
|
||||
create_postgis(con, df_reproj, srid=4269)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = read_postgis(sql, con)
|
||||
|
||||
validate_boro_df(df)
|
||||
assert df.crs == crs
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_override_srid(self, connection_postgis, df_nybb):
|
||||
"""Tests that a user specified CRS overrides the geodatabase SRID."""
|
||||
con = connection_postgis
|
||||
orig_crs = df_nybb.crs
|
||||
create_postgis(con, df_nybb, srid=4269)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = read_postgis(sql, con, crs=orig_crs)
|
||||
|
||||
validate_boro_df(df)
|
||||
assert df.crs == orig_crs
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_from_postgis_default(self, connection_postgis, df_nybb):
|
||||
con = connection_postgis
|
||||
create_postgis(con, df_nybb)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = GeoDataFrame.from_postgis(sql, con)
|
||||
|
||||
validate_boro_df(df, case_sensitive=False)
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_from_postgis_custom_geom_col(self, connection_postgis, df_nybb):
|
||||
con = connection_postgis
|
||||
geom_col = "the_geom"
|
||||
create_postgis(con, df_nybb, geom_col=geom_col)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = GeoDataFrame.from_postgis(sql, con, geom_col=geom_col)
|
||||
|
||||
validate_boro_df(df, case_sensitive=False)
|
||||
|
||||
def test_read_postgis_null_geom(self, connection_spatialite, df_nybb):
|
||||
"""Tests that geometry with NULL is accepted."""
|
||||
con = connection_spatialite
|
||||
geom_col = df_nybb.geometry.name
|
||||
df_nybb.geometry.iat[0] = None
|
||||
create_spatialite(con, df_nybb)
|
||||
sql = (
|
||||
"SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
|
||||
'AsEWKB("{0}") AS "{0}" FROM nybb'.format(geom_col)
|
||||
)
|
||||
df = read_postgis(sql, con, geom_col=geom_col)
|
||||
validate_boro_df(df)
|
||||
|
||||
def test_read_postgis_binary(self, connection_spatialite, df_nybb):
|
||||
"""Tests that geometry read as binary is accepted."""
|
||||
con = connection_spatialite
|
||||
geom_col = df_nybb.geometry.name
|
||||
create_spatialite(con, df_nybb)
|
||||
sql = (
|
||||
"SELECT ogc_fid, borocode, boroname, shape_leng, shape_area, "
|
||||
'ST_AsBinary("{0}") AS "{0}" FROM nybb'.format(geom_col)
|
||||
)
|
||||
df = read_postgis(sql, con, geom_col=geom_col)
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_read_postgis_chunksize(self, connection_postgis, df_nybb):
|
||||
"""Test chunksize argument"""
|
||||
chunksize = 2
|
||||
con = connection_postgis
|
||||
create_postgis(con, df_nybb)
|
||||
|
||||
sql = "SELECT * FROM nybb;"
|
||||
df = pd.concat(read_postgis(sql, con, chunksize=chunksize))
|
||||
|
||||
validate_boro_df(df)
|
||||
# no crs defined on the created geodatabase, and none specified by the user;
# the crs should not be set to 0, as a get_srid failure would do
|
||||
assert df.crs is None
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_default(self, engine_postgis, df_nybb):
|
||||
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
|
||||
engine = engine_postgis
|
||||
table = "nybb"
|
||||
|
||||
# If table exists, delete it before trying to write with defaults
|
||||
drop_table_if_exists(engine, table)
|
||||
|
||||
# Write to db
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
|
||||
# Validate
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_uppercase_tablename(self, engine_postgis, df_nybb):
|
||||
"""Tests writing GeoDataFrame to PostGIS with uppercase tablename."""
|
||||
engine = engine_postgis
|
||||
table = "aTestTable"
|
||||
|
||||
# If table exists, delete it before trying to write with defaults
|
||||
drop_table_if_exists(engine, table)
|
||||
|
||||
# Write to db
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
|
||||
# Validate
|
||||
sql = text('SELECT * FROM "{table}";'.format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_sqlalchemy_connection(self, engine_postgis, df_nybb):
|
||||
"""Tests that GeoDataFrame can be written to PostGIS with defaults."""
|
||||
with engine_postgis.begin() as con:
|
||||
table = "nybb_con"
|
||||
|
||||
# If table exists, delete it before trying to write with defaults
|
||||
drop_table_if_exists(con, table)
|
||||
|
||||
# Write to db
|
||||
write_postgis(df_nybb, con=con, name=table, if_exists="fail")
|
||||
# Validate
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, con, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_fail_when_table_exists(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that uploading the same table raises an error when if_exists='fail'.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
|
||||
# Ensure table exists
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
|
||||
try:
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="fail")
|
||||
except ValueError as e:
|
||||
if "already exists" in str(e):
|
||||
pass
|
||||
else:
|
||||
raise e
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_replace_when_table_exists(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that replacing a table is possible when if_exists='replace'.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
|
||||
# Ensure table exists
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
# Overwrite
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
# Validate
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
validate_boro_df(df)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_append_when_table_exists(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that appending to an existing table produces correct results when
if_exists='append'.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
|
||||
orig_rows, orig_cols = df_nybb.shape
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="append")
|
||||
# Validate
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
new_rows, new_cols = df.shape
|
||||
|
||||
# There should be twice as many rows in the new table
|
||||
assert new_rows == orig_rows * 2, (
|
||||
"There should be {target} rows,found: {current}".format(
|
||||
target=orig_rows * 2, current=new_rows
|
||||
),
|
||||
)
|
||||
# Number of columns should stay the same
|
||||
assert new_cols == orig_cols, (
|
||||
"There should be {target} columns,found: {current}".format(
|
||||
target=orig_cols, current=new_cols
|
||||
),
|
||||
)
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_without_crs(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that GeoDataFrame can be written to PostGIS without CRS information.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
|
||||
# Write to db
|
||||
df_nybb.geometry.array.crs = None
|
||||
with pytest.warns(UserWarning, match="Could not parse CRS from the GeoDataF"):
|
||||
write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
|
||||
# Validate that srid is 0
|
||||
sql = text(
|
||||
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
|
||||
schema="public", table=table, geom_col="geometry"
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
target_srid = conn.execute(sql).fetchone()[0]
|
||||
assert target_srid == 0, "SRID should be 0, found %s" % target_srid
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_with_esri_authority(self, engine_postgis, df_nybb):
|
||||
"""
|
||||
Tests that GeoDataFrame can be written to PostGIS with ESRI Authority
|
||||
CRS information (GH #2414).
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "nybb"
|
||||
|
||||
# Write to db
|
||||
df_nybb_esri = df_nybb.to_crs("ESRI:102003")
|
||||
write_postgis(df_nybb_esri, con=engine, name=table, if_exists="replace")
|
||||
# Validate that srid is 102003
|
||||
sql = text(
|
||||
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
|
||||
schema="public", table=table, geom_col="geometry"
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
target_srid = conn.execute(sql).fetchone()[0]
|
||||
assert target_srid == 102003, "SRID should be 102003, found %s" % target_srid
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_geometry_collection(
|
||||
self, engine_postgis, df_geom_collection
|
||||
):
|
||||
"""
|
||||
Tests that writing a mix of different geometry types is possible.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "geomtype_tests"
|
||||
|
||||
write_postgis(df_geom_collection, con=engine, name=table, if_exists="replace")
|
||||
|
||||
# Validate geometry type
|
||||
sql = text(
|
||||
"SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
|
||||
table=table
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
geom_type = conn.execute(sql).fetchone()[0]
|
||||
sql = text("SELECT * FROM {table};".format(table=table))
|
||||
df = read_postgis(sql, engine, geom_col="geometry")
|
||||
|
||||
assert geom_type.upper() == "GEOMETRYCOLLECTION"
|
||||
assert df.geom_type.unique()[0] == "GeometryCollection"
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_mixed_geometry_types(
|
||||
self, engine_postgis, df_mixed_single_and_multi
|
||||
):
|
||||
"""
|
||||
Tests that writing a mix of single and MultiGeometries is possible.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "geomtype_tests"
|
||||
|
||||
write_postgis(
|
||||
df_mixed_single_and_multi, con=engine, name=table, if_exists="replace"
|
||||
)
|
||||
|
||||
# Validate geometry type
|
||||
sql = text(
|
||||
"SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
|
||||
table=table
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
res = conn.execute(sql).fetchall()
|
||||
assert res[0][0].upper() == "LINESTRING"
|
||||
assert res[1][0].upper() == "MULTILINESTRING"
|
||||
assert res[2][0].upper() == "POINT"
|
||||
|
||||
@pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
|
||||
def test_write_postgis_linear_ring(self, engine_postgis, df_linear_ring):
|
||||
"""
|
||||
Tests that writing a LinearRing works.
|
||||
"""
|
||||
engine = engine_postgis
|
||||
|
||||
table = "geomtype_tests"
|
||||
|
||||
write_postgis(df_linear_ring, con=engine, name=table, if_exists="replace")
|
||||
|
||||
# Validate geometry type
|
||||
sql = text(
|
||||
"SELECT DISTINCT(GeometryType(geometry)) FROM {table} ORDER BY 1;".format(
|
||||
table=table
|
||||
)
|
||||
)
|
||||
with engine.connect() as conn:
|
||||
geom_type = conn.execute(sql).fetchone()[0]
|
||||
|
||||
assert geom_type.upper() == "LINESTRING"
|
||||
|
||||
    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_in_chunks(self, engine_postgis, df_mixed_single_and_multi):
        """
        Tests that writing in chunks (chunksize argument) works.
        """
        engine = engine_postgis

        table = "geomtype_tests"

        write_postgis(
            df_mixed_single_and_multi,
            con=engine,
            name=table,
            if_exists="replace",
            chunksize=1,
        )
        # Validate row count
        sql = text("SELECT COUNT(geometry) FROM {table};".format(table=table))
        with engine.connect() as conn:
            row_cnt = conn.execute(sql).fetchone()[0]
        assert row_cnt == 3

        # Validate geometry type
        sql = text(
            "SELECT DISTINCT GeometryType(geometry) FROM {table} ORDER BY 1;".format(
                table=table
            )
        )
        with engine.connect() as conn:
            res = conn.execute(sql).fetchall()
        assert res[0][0].upper() == "LINESTRING"
        assert res[1][0].upper() == "MULTILINESTRING"
        assert res[2][0].upper() == "POINT"

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_to_different_schema(self, engine_postgis, df_nybb):
        """
        Tests writing data to an alternative schema.
        """
        engine = engine_postgis

        table = "nybb"
        schema_to_use = "test"
        sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
        with engine.begin() as conn:
            conn.execute(sql)

        write_postgis(
            df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
        )
        # Validate
        sql = text(
            "SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
        )

        df = read_postgis(sql, engine, geom_col="geometry")
        validate_boro_df(df)

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_to_different_schema_when_table_exists(
        self, engine_postgis, df_nybb
    ):
        """
        Tests writing data to an alternative schema when the table already exists.
        """
        engine = engine_postgis

        table = "nybb"
        schema_to_use = "test"
        sql = text("CREATE SCHEMA IF NOT EXISTS {schema};".format(schema=schema_to_use))
        with engine.begin() as conn:
            conn.execute(sql)

        try:
            write_postgis(
                df_nybb, con=engine, name=table, if_exists="fail", schema=schema_to_use
            )
            # Validate
            sql = text(
                "SELECT * FROM {schema}.{table};".format(
                    schema=schema_to_use, table=table
                )
            )

            df = read_postgis(sql, engine, geom_col="geometry")
            validate_boro_df(df)

        # Should raise a ValueError when table exists
        except ValueError:
            pass

        # Try with replace flag on
        write_postgis(
            df_nybb, con=engine, name=table, if_exists="replace", schema=schema_to_use
        )
        # Validate
        sql = text(
            "SELECT * FROM {schema}.{table};".format(schema=schema_to_use, table=table)
        )

        df = read_postgis(sql, engine, geom_col="geometry")
        validate_boro_df(df)

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_write_postgis_3D_geometries(self, engine_postgis, df_3D_geoms):
        """
        Tests that writing geometries with 3 dimensions works.
        """
        engine = engine_postgis

        table = "geomtype_tests"

        write_postgis(df_3D_geoms, con=engine, name=table, if_exists="replace")

        # Check that all geometries have 3 dimensions
        sql = text("SELECT * FROM {table};".format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")
        assert list(df.geometry.has_z) == [True, True, True]

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_row_order(self, engine_postgis, df_nybb):
        """
        Tests that the row order in db table follows the order of the original frame.
        """
        engine = engine_postgis

        table = "row_order_test"
        correct_order = df_nybb["BoroCode"].tolist()

        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")

        # Check that the row order matches
        sql = text("SELECT * FROM {table};".format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")
        assert df["BoroCode"].tolist() == correct_order

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_append_before_table_exists(self, engine_postgis, df_nybb):
        """
        Tests that insert works with if_exists='append' when table does not exist yet.
        """
        engine = engine_postgis

        table = "nybb"
        # If table exists, delete it before trying to write with defaults
        drop_table_if_exists(engine, table)

        write_postgis(df_nybb, con=engine, name=table, if_exists="append")

        # Check that the table was created and the data was written
        sql = text("SELECT * FROM {table};".format(table=table))
        df = read_postgis(sql, engine, geom_col="geometry")
        validate_boro_df(df)

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_append_with_different_crs(self, engine_postgis, df_nybb):
        """
        Tests that an error is raised if the table CRS differs from the frame's CRS.
        """
        engine = engine_postgis

        table = "nybb"
        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")

        # Reproject
        df_nybb2 = df_nybb.to_crs(epsg=4326)

        # Should raise error when appending
        with pytest.raises(ValueError, match="CRS of the target table"):
            write_postgis(df_nybb2, con=engine, name=table, if_exists="append")

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_append_without_crs(self, engine_postgis, df_nybb):
        # This test was added in #3328 when the default SRID for a frame with
        # no CRS was changed from -1 to 0. PostGIS uses an SRID of 0 for
        # geometries without a CRS, so appending dataframes that have no CRS
        # now works.
        engine = engine_postgis
        df_nybb = df_nybb.set_crs(None, allow_override=True)
        table = "nybb"

        write_postgis(df_nybb, con=engine, name=table, if_exists="replace")
        # append another dataframe with no crs

        df_nybb2 = df_nybb
        write_postgis(df_nybb2, con=engine, name=table, if_exists="append")
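
        # Illustrative note (not part of the original test): given the SRID-0
        # convention described above, a Find_SRID check like the one used
        # earlier in this module would be expected to report 0 here, e.g.
        #   SELECT Find_SRID('public', 'nybb', 'geometry');  -- expected: 0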

    @pytest.mark.parametrize("engine_postgis", POSTGIS_DRIVERS, indirect=True)
    @pytest.mark.xfail(
        compat.PANDAS_GE_20 and not compat.PANDAS_GE_202,
        reason="Duplicate columns are dropped in read_sql with pandas 2.0.0 and 2.0.1",
    )
    def test_duplicate_geometry_column_fails(self, engine_postgis):
        """
        Tests that a ValueError is raised if an SQL query returns two geometry columns.
        """
        engine = engine_postgis

        sql = "select ST_MakePoint(0, 0) as geom, ST_MakePoint(0, 0) as geom;"

        with pytest.raises(ValueError):
            read_postgis(sql, engine, geom_col="geom")

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_non_epsg_crs(self, connection_postgis, df_nybb):
        con = connection_postgis
        df_nybb = df_nybb.to_crs(crs="esri:54052")
        create_postgis(con, df_nybb, srid=54052)

        sql = "SELECT * FROM nybb;"
        df = read_postgis(sql, con)
        validate_boro_df(df)
        assert df.crs == "ESRI:54052"

    @pytest.mark.skipif(not HAS_PYPROJ, reason="pyproj not installed")
    @mock.patch("shapely.get_srid")
    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_srid_not_in_table(self, mock_get_srid, connection_postgis, df_nybb):
        # Mock a non-existent srid to cover the edge case where shapely reports
        # an srid that is not present in the postgis table.
        pyproj = pytest.importorskip("pyproj")

        mock_get_srid.return_value = 99999

        con = connection_postgis
        df_nybb = df_nybb.to_crs(crs="epsg:4326")
        create_postgis(con, df_nybb)

        sql = "SELECT * FROM nybb;"
        with pytest.raises(pyproj.exceptions.CRSError, match="crs not found"):
            with pytest.warns(UserWarning, match="Could not find srid 99999"):
                read_postgis(sql, con)

    @mock.patch("geopandas.io.sql._get_spatial_ref_sys_df")
    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_no_spatial_ref_sys_table_in_postgis(
        self, mock_get_spatial_ref_sys_df, connection_postgis, df_nybb
    ):
        # Mock a database where the spatial_ref_sys table does not exist

        mock_get_spatial_ref_sys_df.side_effect = pd.errors.DatabaseError

        con = connection_postgis
        df_nybb = df_nybb.to_crs(crs="epsg:4326")
        create_postgis(con, df_nybb, srid=4326)

        sql = "SELECT * FROM nybb;"
        with pytest.warns(
            UserWarning, match="Could not find the spatial reference system table"
        ):
            df = read_postgis(sql, con)

        assert df.crs == "EPSG:4326"

    @pytest.mark.parametrize("connection_postgis", POSTGIS_DRIVERS, indirect=True)
    def test_read_non_epsg_crs_chunksize(self, connection_postgis, df_nybb):
        """Test chunksize argument with a non-EPSG CRS"""
        chunksize = 2
        con = connection_postgis
        df_nybb = df_nybb.to_crs(crs="esri:54052")

        create_postgis(con, df_nybb, srid=54052)

        sql = "SELECT * FROM nybb;"
        df = pd.concat(read_postgis(sql, con, chunksize=chunksize))

        validate_boro_df(df)
        assert df.crs == "ESRI:54052"
@@ -0,0 +1,118 @@
"""Vendored, cut down version of pyogrio/util.py for use with fiona"""

import re
import sys
from urllib.parse import urlparse


def vsi_path(path: str) -> str:
    """
    Ensure path is a local path or a GDAL-compatible vsi path.
    """

    # path is already in GDAL format
    if path.startswith("/vsi"):
        return path

    # Windows drive letters (e.g. "C:\") confuse `urlparse` as they look like
    # URL schemes
    if sys.platform == "win32" and re.match("^[a-zA-Z]\\:", path):
        if not path.split("!")[0].endswith(".zip"):
            return path

        # prefix the path so the remaining parsing can proceed
        path = f"zip://{path}"

    path, archive, scheme = _parse_uri(path)

    if scheme or archive or path.endswith(".zip"):
        return _construct_vsi_path(path, archive, scheme)

    return path


# Supported URI schemes and their mapping to GDAL's VSI suffix.
SCHEMES = {
    "file": "file",
    "zip": "zip",
    "tar": "tar",
    "gzip": "gzip",
    "http": "curl",
    "https": "curl",
    "ftp": "curl",
    "s3": "s3",
    "gs": "gs",
    "az": "az",
    "adls": "adls",
    "adl": "adls",  # fsspec uses this
    "hdfs": "hdfs",
    "webhdfs": "webhdfs",
    # GDAL additionally supports oss and swift for remote filesystems, but
    # those are not added as supported URI schemes for now
}

CURLSCHEMES = {k for k, v in SCHEMES.items() if v == "curl"}
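# The set above resolves to {"http", "https", "ftp"}: the schemes that map to
# GDAL's /vsicurl/ handler and therefore get their "<scheme>://" prefix
# re-attached in _construct_vsi_path below.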


def _parse_uri(path: str):
    """
    Parse a URI

    Returns a tuple of (path, archive, scheme)

    path : str
        Parsed path. Includes the hostname and query string in the case
        of a URI.
    archive : str
        Parsed archive path.
    scheme : str
        URI scheme such as "https" or "zip+s3".
    """
    parts = urlparse(path, allow_fragments=False)

    # if the scheme is not one of GDAL's supported schemes, return raw path
    if parts.scheme and not all(p in SCHEMES for p in parts.scheme.split("+")):
        return path, "", ""

    # we have a URI
    path = parts.path
    scheme = parts.scheme or ""

    if parts.query:
        path += "?" + parts.query

    if parts.scheme and parts.netloc:
        path = parts.netloc + path

    parts = path.split("!")
    path = parts.pop() if parts else ""
    archive = parts.pop() if parts else ""
    return (path, archive, scheme)
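
# For illustration (hypothetical input, not part of the vendored module):
# _parse_uri("zip+s3://bucket/archive.zip!layer.shp") returns
# ("layer.shp", "bucket/archive.zip", "zip+s3").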


def _construct_vsi_path(path, archive, scheme) -> str:
    """Convert a parsed path to a GDAL VSI path"""

    prefix = ""
    suffix = ""
    schemes = scheme.split("+")

    if "zip" not in schemes and (archive.endswith(".zip") or path.endswith(".zip")):
        schemes.insert(0, "zip")

    if schemes:
        prefix = "/".join(
            "vsi{0}".format(SCHEMES[p]) for p in schemes if p and p != "file"
        )

        if schemes[-1] in CURLSCHEMES:
            suffix = f"{schemes[-1]}://"

    if prefix:
        if archive:
            return "/{}/{}{}/{}".format(prefix, suffix, archive, path.lstrip("/"))
        else:
            return "/{}/{}{}".format(prefix, suffix, path)

    return path
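
# Illustrative sketch (hypothetical inputs, not part of the vendored module)
# of how vsi_path resolves a few common cases:
#
#   vsi_path("https://example.com/data.geojson")
#   -> "/vsicurl/https://example.com/data.geojson"
#   vsi_path("zip://data/archive.zip!layer.shp")
#   -> "/vsizip/data/archive.zip/layer.shp"
#   vsi_path("s3://bucket/key/data.shp")
#   -> "/vsis3/bucket/key/data.shp"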