This commit is contained in:
2025-01-26 19:24:23 -08:00
parent 32cd60e92b
commit d1dde0dbc6
4155 changed files with 29170 additions and 216373 deletions

View File

@@ -1,24 +1,24 @@
"""Functions for reading and writing GeoPandas dataframes."""
import os
import warnings
import numpy as np
from pyogrio._compat import HAS_GEOPANDAS, PANDAS_GE_15, PANDAS_GE_20
from pyogrio._compat import HAS_GEOPANDAS, PANDAS_GE_15, PANDAS_GE_20, PANDAS_GE_22
from pyogrio.errors import DataSourceError
from pyogrio.raw import (
DRIVERS_NO_MIXED_SINGLE_MULTI,
DRIVERS_NO_MIXED_DIMENSIONS,
detect_write_driver,
DRIVERS_NO_MIXED_SINGLE_MULTI,
_get_write_path_driver,
read,
read_arrow,
write,
)
from pyogrio.errors import DataSourceError
import warnings
def _stringify_path(path):
"""
Convert path-like to a string if possible, pass-through other objects
"""
"""Convert path-like to a string if possible, pass-through other objects."""
if isinstance(path, str):
return path
@@ -33,10 +33,12 @@ def _stringify_path(path):
def _try_parse_datetime(ser):
import pandas as pd # only called when pandas is known to be installed
if PANDAS_GE_20:
datetime_kwargs = dict(format="ISO8601", errors="ignore")
if PANDAS_GE_22:
datetime_kwargs = {"format": "ISO8601"}
elif PANDAS_GE_20:
datetime_kwargs = {"format": "ISO8601", "errors": "ignore"}
else:
datetime_kwargs = dict(yearfirst=True)
datetime_kwargs = {"yearfirst": True}
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
@@ -48,10 +50,13 @@ def _try_parse_datetime(ser):
try:
res = pd.to_datetime(ser, **datetime_kwargs)
except Exception:
pass
res = ser
# if object dtype, try parse as utc instead
if res.dtype == "object":
res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
try:
res = pd.to_datetime(ser, utc=True, **datetime_kwargs)
except Exception:
pass
if res.dtype != "object":
# GDAL only supports ms precision, convert outputs to match.
@@ -82,10 +87,12 @@ def read_dataframe(
sql_dialect=None,
fid_as_index=False,
use_arrow=None,
on_invalid="raise",
arrow_to_pandas_kwargs=None,
**kwargs,
):
"""Read from an OGR data source to a GeoPandas GeoDataFrame or Pandas DataFrame.
If the data source does not have a geometry column or ``read_geometry`` is False,
a DataFrame will be returned.
@@ -94,20 +101,23 @@ def read_dataframe(
Parameters
----------
path_or_buffer : pathlib.Path or str, or bytes buffer
A dataset path or URI, or raw buffer.
A dataset path or URI, raw buffer, or file-like object with a read method.
layer : int or str, optional (default: first layer)
If an integer is provided, it corresponds to the index of the layer
with the data source. If a string is provided, it must match the name
of the layer in the data source. Defaults to first layer in data source.
encoding : str, optional (default: None)
If present, will be used as the encoding for reading string values from
the data source, unless encoding can be inferred directly from the data
source.
the data source. By default will automatically try to detect the native
encoding and decode to ``UTF-8``.
columns : list-like, optional (default: all columns)
List of column names to import from the data source. Column names must
exactly match the names in the data source, and will be returned in
the order they occur in the data source. To avoid reading any columns,
pass an empty list-like.
pass an empty list-like. If combined with ``where`` parameter, must
include columns referenced in the ``where`` expression or the data may
not be correctly read; the data source may return empty results or
raise an exception (behavior varies by driver).
read_geometry : bool, optional (default: True)
If True, will read geometry into a GeoSeries. If False, a Pandas DataFrame
will be returned instead.
@@ -152,7 +162,12 @@ def read_dataframe(
the starting index is driver and file specific (e.g. typically 0 for
Shapefile and 1 for GeoPackage, but can still depend on the specific
file). The performance of reading a large number of features usings FIDs
is also driver specific.
is also driver specific and depends on the value of ``use_arrow``. The order
of the rows returned is undefined. If you would like to sort based on FID, use
``fid_as_index=True`` to have the index of the GeoDataFrame returned set to the
FIDs of the features read. If ``use_arrow=True``, the number of FIDs is limited
to 4997 for drivers with 'OGRSQL' as default SQL dialect. To read a larger
number of FIDs, set ``use_arrow=False``.
sql : str, optional (default: None)
The SQL statement to execute. Look at the sql_dialect parameter for more
information on the syntax to use for the query. When combined with other
@@ -184,6 +199,17 @@ def read_dataframe(
installed). When enabled, this provides a further speed-up.
Defaults to False, but this default can also be globally overridden
by setting the ``PYOGRIO_USE_ARROW=1`` environment variable.
on_invalid : str, optional (default: "raise")
The action to take when an invalid geometry is encountered. Possible
values:
- **raise**: an exception will be raised if a WKB input geometry is
invalid.
- **warn**: invalid WKB geometries will be returned as ``None`` and a
warning will be raised.
- **ignore**: invalid WKB geometries will be returned as ``None``
without a warning.
arrow_to_pandas_kwargs : dict, optional (default: None)
When `use_arrow` is True, these kwargs will be passed to the `to_pandas`_
call for the arrow to pandas conversion.
@@ -215,13 +241,13 @@ def read_dataframe(
https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table.to_pandas
""" # noqa: E501
"""
if not HAS_GEOPANDAS:
raise ImportError("geopandas is required to use pyogrio.read_dataframe()")
import pandas as pd
import geopandas as gp
from geopandas.array import from_wkb
import pandas as pd
import shapely # if geopandas is present, shapely is expected to be present
path_or_buffer = _stringify_path(path_or_buffer)
@@ -279,10 +305,10 @@ def read_dataframe(
if PANDAS_GE_15 and wkb_values.dtype != object:
# for example ArrowDtype will otherwise create numpy array with pd.NA
wkb_values = wkb_values.to_numpy(na_value=None)
df["geometry"] = from_wkb(wkb_values, crs=meta["crs"])
df["geometry"] = shapely.from_wkb(wkb_values, on_invalid=on_invalid)
if force_2d:
df["geometry"] = shapely.force_2d(df["geometry"])
return gp.GeoDataFrame(df, geometry="geometry")
return gp.GeoDataFrame(df, geometry="geometry", crs=meta["crs"])
else:
return df
@@ -302,9 +328,9 @@ def read_dataframe(
if geometry is None or not read_geometry:
return df
geometry = from_wkb(geometry, crs=meta["crs"])
geometry = shapely.from_wkb(geometry, on_invalid=on_invalid)
return gp.GeoDataFrame(df, geometry=geometry)
return gp.GeoDataFrame(df, geometry=geometry, crs=meta["crs"])
# TODO: handle index properly
@@ -318,6 +344,7 @@ def write_dataframe(
promote_to_multi=None,
nan_as_null=True,
append=False,
use_arrow=None,
dataset_metadata=None,
layer_metadata=None,
metadata=None,
@@ -325,8 +352,7 @@ def write_dataframe(
layer_options=None,
**kwargs,
):
"""
Write GeoPandas GeoDataFrame to an OGR file format.
"""Write GeoPandas GeoDataFrame to an OGR file format.
Parameters
----------
@@ -335,16 +361,21 @@ def write_dataframe(
all values will be converted to strings to be written to the
output file, except None and np.nan, which will be set to NULL
in the output file.
path : str
path to file
layer :str, optional (default: None)
layer name
path : str or io.BytesIO
path to output file on writeable file system or an io.BytesIO object to
allow writing to memory. Will raise NotImplementedError if an open file
handle is passed; use BytesIO instead.
NOTE: support for writing to memory is limited to specific drivers.
layer : str, optional (default: None)
layer name to create. If writing to memory and layer name is not
provided, the layer name will be set to a UUID4 value.
driver : string, optional (default: None)
The OGR format driver used to write the vector file. By default write_dataframe
attempts to infer driver from path.
The OGR format driver used to write the vector file. By default attempts
to infer driver from path. Must be provided to write to memory.
encoding : str, optional (default: None)
If present, will be used as the encoding for writing string values to
the file.
the file. Use with caution, only certain drivers support encodings
other than UTF-8.
geometry_type : string, optional (default: None)
By default, the geometry type of the layer will be inferred from the
data, after applying the promote_to_multi logic. If the data only contains a
@@ -376,8 +407,17 @@ def write_dataframe(
append : bool, optional (default: False)
If True, the data source specified by path already exists, and the
driver supports appending to an existing data source, will cause the
data to be appended to the existing records in the data source.
data to be appended to the existing records in the data source. Not
supported for writing to in-memory files.
NOTE: append support is limited to specific drivers and GDAL versions.
use_arrow : bool, optional (default: False)
Whether to use Arrow as the transfer mechanism of the data to write
from Python to GDAL (requires GDAL >= 3.8 and `pyarrow` to be
installed). When enabled, this provides a further speed-up.
Defaults to False, but this default can also be globally overridden
by setting the ``PYOGRIO_USE_ARROW=1`` environment variable.
Using Arrow does not support writing an object-dtype column with
mixed types.
dataset_metadata : dict, optional (default: None)
Metadata to be stored at the dataset level in the output file; limited
to drivers that support writing metadata, such as GPKG, and silently
@@ -389,10 +429,10 @@ def write_dataframe(
metadata : dict, optional (default: None)
alias of layer_metadata
dataset_options : dict, optional
Dataset creation option (format specific) passed to OGR. Specify as
Dataset creation options (format specific) passed to OGR. Specify as
a key-value dictionary.
layer_options : dict, optional
Layer creation option (format specific) passed to OGR. Specify as
Layer creation options (format specific) passed to OGR. Specify as
a key-value dictionary.
**kwargs
Additional driver-specific dataset or layer creation options passed
@@ -402,23 +442,22 @@ def write_dataframe(
explicit `dataset_options` or `layer_options` keywords to manually
do this (for example if an option exists as both dataset and layer
option).
"""
# TODO: add examples to the docstring (e.g. OGR kwargs)
if not HAS_GEOPANDAS:
raise ImportError("geopandas is required to use pyogrio.write_dataframe()")
from geopandas.array import to_wkb
import pandas as pd
from pyproj.enums import WktVersion # if geopandas is available so is pyproj
path = str(path)
from geopandas.array import to_wkb
if not isinstance(df, pd.DataFrame):
raise ValueError("'df' must be a DataFrame or GeoDataFrame")
if driver is None:
driver = detect_write_driver(path)
if use_arrow is None:
use_arrow = bool(int(os.environ.get("PYOGRIO_USE_ARROW", "0")))
path, driver = _get_write_path_driver(path, driver, append=append)
geometry_columns = df.columns[df.dtypes == "geometry"]
if len(geometry_columns) > 1:
@@ -456,11 +495,11 @@ def write_dataframe(
# https://gdal.org/development/rfc/rfc56_millisecond_precision.html#core-changes
# Convert each row offset to a signed multiple of 15m and add to GMT value
gdal_offset_representation = tz_offset // pd.Timedelta("15m") + 100
gdal_tz_offsets[name] = gdal_offset_representation
gdal_tz_offsets[name] = gdal_offset_representation.values
else:
values = col.values
if isinstance(values, pd.api.extensions.ExtensionArray):
from pandas.arrays import IntegerArray, FloatingArray, BooleanArray
from pandas.arrays import BooleanArray, FloatingArray, IntegerArray
if isinstance(values, (IntegerArray, FloatingArray, BooleanArray)):
field_data.append(values._data)
@@ -473,6 +512,9 @@ def write_dataframe(
field_mask.append(None)
# Determine geometry_type and/or promote_to_multi
if geometry_column is not None:
geometry_types_all = geometry.geom_type
if geometry_column is not None and (
geometry_type is None or promote_to_multi is None
):
@@ -482,9 +524,10 @@ def write_dataframe(
# If there is data, infer layer geometry type + promote_to_multi
if not df.empty:
# None/Empty geometries sometimes report as Z incorrectly, so ignore them
has_z_arr = geometry[
(geometry != np.array(None)) & (~geometry.is_empty)
].has_z
with warnings.catch_warnings():
warnings.filterwarnings("ignore", r"GeoSeries\.notna", UserWarning)
geometry_notna = geometry.notna()
has_z_arr = geometry[geometry_notna & (~geometry.is_empty)].has_z
has_z = has_z_arr.any()
all_z = has_z_arr.all()
@@ -493,7 +536,7 @@ def write_dataframe(
f"Mixed 2D and 3D coordinates are not supported by {driver}"
)
geometry_types = pd.Series(geometry.type.unique()).dropna().values
geometry_types = pd.Series(geometry_types_all.unique()).dropna().values
if len(geometry_types) == 1:
tmp_geometry_type = geometry_types[0]
if promote_to_multi and tmp_geometry_type in (
@@ -539,7 +582,78 @@ def write_dataframe(
if epsg:
crs = f"EPSG:{epsg}"
else:
crs = geometry.crs.to_wkt(WktVersion.WKT1_GDAL)
crs = geometry.crs.to_wkt("WKT1_GDAL")
if use_arrow:
import pyarrow as pa
from pyogrio.raw import write_arrow
if geometry_column is not None:
# Convert to multi type
if promote_to_multi:
import shapely
mask_points = geometry_types_all == "Point"
mask_linestrings = geometry_types_all == "LineString"
mask_polygons = geometry_types_all == "Polygon"
if mask_points.any():
geometry[mask_points] = shapely.multipoints(
np.atleast_2d(geometry[mask_points]), axis=0
)
if mask_linestrings.any():
geometry[mask_linestrings] = shapely.multilinestrings(
np.atleast_2d(geometry[mask_linestrings]), axis=0
)
if mask_polygons.any():
geometry[mask_polygons] = shapely.multipolygons(
np.atleast_2d(geometry[mask_polygons]), axis=0
)
geometry = to_wkb(geometry.values)
df = df.copy(deep=False)
# convert to plain DataFrame to avoid warning from geopandas about
# writing non-geometries to the geometry column
df = pd.DataFrame(df, copy=False)
df[geometry_column] = geometry
table = pa.Table.from_pandas(df, preserve_index=False)
if geometry_column is not None:
# ensure that the geometry column is binary (for all-null geometries,
# this could be a wrong type)
geom_field = table.schema.field(geometry_column)
if not (
pa.types.is_binary(geom_field.type)
or pa.types.is_large_binary(geom_field.type)
):
table = table.set_column(
table.schema.get_field_index(geometry_column),
geom_field.with_type(pa.binary()),
table[geometry_column].cast(pa.binary()),
)
write_arrow(
table,
path,
layer=layer,
driver=driver,
geometry_name=geometry_column,
geometry_type=geometry_type,
crs=crs,
encoding=encoding,
append=append,
dataset_metadata=dataset_metadata,
layer_metadata=layer_metadata,
metadata=metadata,
dataset_options=dataset_options,
layer_options=layer_options,
**kwargs,
)
return
# If there is geometry data, prepare it to be written
if geometry_column is not None: