library packages

2024-09-28 22:56:00 -07:00
parent 64d9b78b3a
commit 1973934e95
4893 changed files with 1184173 additions and 31 deletions


@@ -0,0 +1,20 @@
This package contains a modified version of ca-bundle.crt:
ca-bundle.crt -- Bundle of CA Root Certificates
This is a bundle of X.509 certificates of public Certificate Authorities
(CA). These were automatically extracted from Mozilla's root certificates
file (certdata.txt). This file can be found in the mozilla source tree:
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
It contains the certificates in PEM format and therefore
can be directly used with curl / libcurl / php_curl, or with
an Apache+mod_ssl webserver for SSL client authentication.
Just configure this file as the SSLCACertificateFile.
***** BEGIN LICENSE BLOCK *****
This Source Code Form is subject to the terms of the Mozilla Public License,
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
one at http://mozilla.org/MPL/2.0/.
***** END LICENSE BLOCK *****
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $


@@ -0,0 +1,67 @@
Metadata-Version: 2.1
Name: certifi
Version: 2024.8.30
Summary: Python package for providing Mozilla's CA Bundle.
Home-page: https://github.com/certifi/python-certifi
Author: Kenneth Reitz
Author-email: me@kennethreitz.com
License: MPL-2.0
Project-URL: Source, https://github.com/certifi/python-certifi
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
Classifier: Natural Language :: English
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.6
License-File: LICENSE
Certifi: Python SSL Certificates
================================
Certifi provides Mozilla's carefully curated collection of Root Certificates for
validating the trustworthiness of SSL certificates while verifying the identity
of TLS hosts. It has been extracted from the `Requests`_ project.
Installation
------------
``certifi`` is available on PyPI. Simply install it with ``pip``::
$ pip install certifi
Usage
-----
To reference the installed certificate authority (CA) bundle, you can use the
built-in function::
>>> import certifi
>>> certifi.where()
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
Or from the command line::
$ python -m certifi
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
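To pin the bundle when creating a TLS context, the returned path can be passed to the
standard-library ``ssl`` module (a minimal sketch, not taken from the package's own
documentation)::
>>> import ssl
>>> import certifi
>>> ctx = ssl.create_default_context(cafile=certifi.where())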
Enjoy!
.. _`Requests`: https://requests.readthedocs.io/en/master/
Addition/Removal of Certificates
--------------------------------
Certifi does not support any addition/removal or other modification of the
CA trust store content. This project is intended to provide a reliable and
highly portable root of trust to python deployments. Look to upstream projects
for methods to use alternate trust.


@@ -0,0 +1,14 @@
certifi-2024.8.30.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
certifi-2024.8.30.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
certifi-2024.8.30.dist-info/METADATA,sha256=GhBHRVUN6a4ZdUgE_N5wmukJfyuoE-QyIl8Y3ifNQBM,2222
certifi-2024.8.30.dist-info/RECORD,,
certifi-2024.8.30.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
certifi-2024.8.30.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
certifi/__init__.py,sha256=p_GYZrjUwPBUhpLlCZoGb0miKBKSqDAyZC5DvIuqbHQ,94
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
certifi/__pycache__/__init__.cpython-312.pyc,,
certifi/__pycache__/__main__.cpython-312.pyc,,
certifi/__pycache__/core.cpython-312.pyc,,
certifi/cacert.pem,sha256=lO3rZukXdPyuk6BWUJFOKQliWaXH6HGh9l1GGrUgG0c,299427
certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0


@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (74.0.0)
Root-Is-Purelib: true
Tag: py3-none-any


@@ -0,0 +1,4 @@
from .core import contents, where
__all__ = ["contents", "where"]
__version__ = "2024.08.30"


@@ -0,0 +1,12 @@
import argparse
from certifi import contents, where
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--contents", action="store_true")
args = parser.parse_args()
if args.contents:
print(contents())
else:
print(where())

File diff suppressed because it is too large


@@ -0,0 +1,114 @@
"""
certifi.py
~~~~~~~~~~
This module returns the installation location of cacert.pem or its contents.
"""
import sys
import atexit
def exit_cacert_ctx() -> None:
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
if sys.version_info >= (3, 11):
from importlib.resources import as_file, files
_CACERT_CTX = None
_CACERT_PATH = None
def where() -> str:
# This is slightly terrible, but we want to delay extracting the file
# in cases where we're inside of a zipimport situation until someone
# actually calls where(), but we don't want to re-extract the file
# on every call of where(), so we'll do it once then store it in a
# global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you to
# manage the cleanup of this file, so it doesn't actually return a
# path, it returns a context manager that will give you the path
# when you enter it and will do any cleanup when you leave it. In
# the common case of not needing a temporary file, it will just
# return the file system location and the __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
_CACERT_PATH = str(_CACERT_CTX.__enter__())
atexit.register(exit_cacert_ctx)
return _CACERT_PATH
def contents() -> str:
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
elif sys.version_info >= (3, 7):
from importlib.resources import path as get_path, read_text
_CACERT_CTX = None
_CACERT_PATH = None
def where() -> str:
# This is slightly terrible, but we want to delay extracting the
# file in cases where we're inside of a zipimport situation until
# someone actually calls where(), but we don't want to re-extract
# the file on every call of where(), so we'll do it once then store
# it in a global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you
# to manage the cleanup of this file, so it doesn't actually
# return a path, it returns a context manager that will give
# you the path when you enter it and will do any cleanup when
# you leave it. In the common case of not needing a temporary
# file, it will just return the file system location and the
# __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = get_path("certifi", "cacert.pem")
_CACERT_PATH = str(_CACERT_CTX.__enter__())
atexit.register(exit_cacert_ctx)
return _CACERT_PATH
def contents() -> str:
return read_text("certifi", "cacert.pem", encoding="ascii")
else:
import os
import types
from typing import Union
Package = Union[types.ModuleType, str]
Resource = Union[str, "os.PathLike"]
# This fallback will work for Python versions prior to 3.7 that lack the
# importlib.resources module but relies on the existing `where` function
# so won't address issues with environments like PyOxidizer that don't set
# __file__ on modules.
def read_text(
package: Package,
resource: Resource,
encoding: str = 'utf-8',
errors: str = 'strict'
) -> str:
with open(where(), encoding=encoding) as data:
return data.read()
# If we don't have importlib.resources, then we will just do the old logic
# of assuming we're on the filesystem and munge the path directly.
def where() -> str:
f = os.path.dirname(__file__)
return os.path.join(f, "cacert.pem")
def contents() -> str:
return read_text("certifi", "cacert.pem", encoding="ascii")


@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@@ -0,0 +1,683 @@
Metadata-Version: 2.1
Name: charset-normalizer
Version: 3.3.2
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
Home-page: https://github.com/Ousret/charset_normalizer
Author: Ahmed TAHRI
Author-email: ahmed.tahri@cloudnursery.dev
License: MIT
Project-URL: Bug Reports, https://github.com/Ousret/charset_normalizer/issues
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/en/latest
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Text Processing :: Linguistic
Classifier: Topic :: Utilities
Classifier: Typing :: Typed
Requires-Python: >=3.7.0
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: unicode_backport
<h1 align="center">Charset Detection, for Everyone 👋</h1>
<p align="center">
<sup>The Real First Universal Charset Detector</sup><br>
<a href="https://pypi.org/project/charset-normalizer">
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
</a>
<a href="https://pepy.tech/project/charset-normalizer/">
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
</a>
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
</a>
</p>
<p align="center">
<sup><i>Featured Packages</i></sup><br>
<a href="https://github.com/jawah/niquests">
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-HTTP_1.1%2C%202%2C_and_3_Client-cyan">
</a>
<a href="https://github.com/jawah/wassima">
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
</a>
</p>
<p align="center">
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
<a href="https://github.com/nickspring/charset-normalizer-rs">
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
</a>
</p>
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
> I'm trying to resolve the issue by taking a new approach.
> All IANA character set names for which the Python core library provides codecs are supported.
<p align="center">
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
</p>
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
| `Fast` | ❌ | ✅ | ✅ |
| `Universal**` | ❌ | ✅ | ❌ |
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
| `Native Python` | ✅ | ✅ | ❌ |
| `Detect spoken language` | ❌ | ✅ | N/A |
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
<p align="center">
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
</p>
*\*\* : They are clearly using specific code for a specific encoding, even if it covers most of the encodings in common use.*<br>
Did you get here because of the logs? See [https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html](https://charset-normalizer.readthedocs.io/en/latest/user/miscellaneous.html)
## ⚡ Performance
This package offers better performance than its counterpart, Chardet. Here are some numbers.
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
| [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
| [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
| charset-normalizer | 100 ms | 50 ms | 5 ms |
Chardet's performance on larger files (1MB+) is very poor. Expect a huge difference on large payloads.
> Stats are generated using 400+ files with default parameters. For details on the files used, see the GHA workflows.
> And yes, these results might change at any time. The dataset can be updated to include more files.
> The actual delays depend heavily on your CPU capabilities. The factors should remain the same.
> Keep in mind that the stats are generous and that Chardet's accuracy versus ours is measured using Chardet's initial capability
> (e.g. supported encodings). Challenge them if you want.
## ✨ Installation
Using pip:
```sh
pip install charset-normalizer -U
```
## 🚀 Basic Usage
### CLI
This package comes with a CLI.
```
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
file [file ...]
The Real First Universal Charset Detector. Discover originating encoding used
on text file. Normalize text to unicode.
positional arguments:
files File(s) to be analysed
optional arguments:
-h, --help show this help message and exit
-v, --verbose Display complementary information about file if any.
Stdout will contain logs about the detection process.
-a, --with-alternative
Output complementary possibilities if any. Top-level
JSON WILL be a list.
-n, --normalize Permit to normalize input file. If not set, program
does not write anything.
-m, --minimal Only output the charset detected to STDOUT. Disabling
JSON output.
-r, --replace Replace file when trying to normalize it instead of
creating a new one.
-f, --force Replace file without asking if you are sure, use this
flag with caution.
-t THRESHOLD, --threshold THRESHOLD
Define a custom maximum amount of chaos allowed in
decoded content. 0. <= chaos <= 1.
--version Show version information and exit.
```
```bash
normalizer ./data/sample.1.fr.srt
```
or
```bash
python -m charset_normalizer ./data/sample.1.fr.srt
```
🎉 Since version 1.4.0 the CLI produces an easily usable stdout result in JSON format.
```json
{
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
"encoding": "cp1252",
"encoding_aliases": [
"1252",
"windows_1252"
],
"alternative_encodings": [
"cp1254",
"cp1256",
"cp1258",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
"mbcs"
],
"language": "French",
"alphabets": [
"Basic Latin",
"Latin-1 Supplement"
],
"has_sig_or_bom": false,
"chaos": 0.149,
"coherence": 97.152,
"unicode_path": null,
"is_preferred": true
}
```
### Python
*Just print out normalized text*
```python
from charset_normalizer import from_path
results = from_path('./my_subtitle.srt')
print(str(results.best()))
```
*Upgrade your code without effort*
```python
from charset_normalizer import detect
```
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
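As an illustration (a minimal sketch; the sample string and the printed fields are not taken from the upstream docs), the drop-in `detect` shim returns the familiar chardet-style dictionary:

```python
from charset_normalizer import detect

# chardet-compatible API: returns a dict with "encoding", "language" and "confidence"
result = detect("Bсеки човек има право на образование.".encode("utf_8"))
print(result["encoding"], result["language"], result["confidence"])
```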
See the docs for advanced usage: [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
## 😇 Why
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
reliable alternative using a completely different method. Also! I never back down on a good challenge!
I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical rendered strings.**
What I want is to get readable text, the best I can.
In a way, **I'm brute forcing text decoding.** How cool is that? 😎
Don't confuse the **ftfy** package with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's is to convert a raw file in an unknown encoding to Unicode.
## 🍰 How
- Discard all charset encoding tables that could not fit the binary content.
- Measure the noise, or mess, once the content is opened (in chunks) with a candidate charset encoding.
- Extract matches with the lowest mess detected.
- Additionally, we measure coherence / probe for a language.
**Wait a minute**, what is noise/mess and coherence according to **YOU**?
*Noise:* I opened hundreds of text files, **written by humans**, with the wrong encoding table. **I observed**, then
**I established** some ground rules about **what is obvious** when **it seems like** a mess.
I know that my interpretation of what counts as noise is probably incomplete; feel free to contribute in order to
improve or rewrite it.
*Coherence:* For each language on earth, we have computed ranked letter-appearance frequencies (as best we can). I figured
that intel is worth something here, so I use those records against decoded text to check whether I can detect intelligent design.
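Put together, the steps above can be sketched in a few lines. This is an illustrative outline only, not the package's actual code (which lives in `charset_normalizer/api.py`); `noise_of` and `coherence_of` are hypothetical stand-ins for the real mess and coherence probes:

```python
from typing import Callable, Iterable, Optional, Tuple

def sketch_detect(
    payload: bytes,
    candidates: Iterable[str],
    noise_of: Callable[[str], float],      # hypothetical stand-in for the mess probe
    coherence_of: Callable[[str], float],  # hypothetical stand-in for the coherence probe
    threshold: float = 0.2,
    chunk_size: int = 512,
) -> Optional[str]:
    best: Optional[Tuple[float, str]] = None
    for encoding in candidates:
        try:
            text = payload.decode(encoding)  # 1. discard tables that cannot fit the bytes
        except (UnicodeDecodeError, LookupError):
            continue
        chunks = [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
        mess = sum(noise_of(c) for c in chunks) / max(len(chunks), 1)  # 2. measure the mess, chunk by chunk
        if mess >= threshold:  # 3. keep only the matches with the lowest mess
            continue
        score = mess - coherence_of(text)  # 4. coherence / language probe breaks ties
        if best is None or score < best[0]:
            best = (score, encoding)
    return best[1] if best else None
```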
## ⚡ Known limitations
- Language detection is unreliable when the text contains two or more languages sharing identical letters (e.g. HTML (English tags) + Turkish content (sharing Latin characters)).
- Every charset detector heavily depends on sufficient content. In common cases, do not bother running detection on very tiny content.
## ⚠️ About Python EOLs
**If you are running:**
- Python >=2.7,<3.5: Unsupported
- Python 3.5: charset-normalizer < 2.1
- Python 3.6: charset-normalizer < 3.1
- Python 3.7: charset-normalizer < 4.0
Upgrade your Python interpreter as soon as possible.
## 👤 Contributing
Contributions, issues and feature requests are very much welcome.<br />
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
## 📝 License
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
## 💼 For Enterprise
Professional support for charset-normalizer is available as part of the [Tidelift
Subscription][1]. Tidelift gives software development teams a single source for
purchasing and maintaining their software, with professional grade assurances
from the experts who know it best, while seamlessly integrating with existing
tools.
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
# Changelog
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
### Fixed
- Unintentional memory usage regression when using large payload that match several encoding (#376)
- Regression on some detection case showcased in the documentation (#371)
### Added
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
### Changed
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
- Improved the general detection reliability based on reports from the community
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
### Added
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
### Removed
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
### Changed
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
### Fixed
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
### Changed
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
- Minor improvement over the global detection reliability
### Added
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
- Explicit support for Python 3.12
### Fixed
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
### Added
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
### Removed
- Support for Python 3.6 (PR #260)
### Changed
- Optional speedup provided by mypy/c 1.0.1
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
### Fixed
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
### Changed
- Speedup provided by mypy/c 0.990 on Python >= 3.7
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Fixed
- CLI with opt --normalize fail when using full path for files
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
- Sphinx warnings when generating the documentation
### Removed
- Coherence detector no longer return 'Simple English' instead return 'English'
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
- Breaking: Method `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
### Fixed
- CLI with opt --normalize fail when using full path for files
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
### Removed
- Coherence detector no longer return 'Simple English' instead return 'English'
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
### Added
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
### Removed
- Breaking: Method `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
### Fixed
- Sphinx warnings when generating the documentation
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
### Changed
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Removed
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
### Deprecated
- Function `normalize` scheduled for removal in 3.0
### Changed
- Removed useless call to decode in fn is_unprintable (#206)
### Fixed
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
### Added
- Output the Unicode table version when running the CLI with `--version` (PR #194)
### Changed
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
### Fixed
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
### Removed
- Support for Python 3.5 (PR #192)
### Deprecated
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
### Fixed
- ASCII miss-detection on rare cases (PR #170)
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
### Added
- Explicit support for Python 3.11 (PR #164)
### Changed
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
### Fixed
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
### Changed
- Skipping the language-detection (CD) on ASCII (PR #155)
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
### Changed
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
### Fixed
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
### Changed
- Improvement over Vietnamese detection (PR #126)
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
### Fixed
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
- Avoid using too insignificant chunk (PR #137)
### Added
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
### Added
- Add support for Kazakh (Cyrillic) language detection (PR #109)
### Changed
- Further, improve inferring the language from a given single-byte code page (PR #112)
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
- Various detection improvement (MD+CD) (PR #117)
### Removed
- Remove redundant logging entry about detected language(s) (PR #115)
### Fixed
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
### Fixed
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
- Fix CLI crash when using --minimal output in certain cases (PR #103)
### Changed
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
### Changed
- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
- The Unicode detection is slightly improved (PR #93)
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
### Removed
- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
### Fixed
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
- The MANIFEST.in was not exhaustive (PR #78)
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
### Fixed
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
- Submatch factoring could be wrong in rare edge cases (PR #72)
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
- Fix line endings from CRLF to LF for certain project files (PR #67)
### Changed
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
- Allow fallback on specified encoding if any (PR #71)
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
### Changed
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
### Fixed
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
### Changed
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
### Fixed
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
- One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
### Changed
- Public function normalize default args values were not aligned with from_bytes (PR #53)
### Added
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
### Changed
- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
- Accent has been made on UTF-8 detection, should perform rather instantaneous.
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
- The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
- utf_7 detection has been reinstated.
### Removed
- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
- Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
- The exception hook on UnicodeDecodeError has been removed.
### Deprecated
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
### Fixed
- The CLI output used the relative path of the file(s). Should be absolute.
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
### Fixed
- Logger configuration/usage no longer conflict with others (PR #44)
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
### Removed
- Using standard logging instead of using the package loguru.
- Dropping nose test framework in favor of the maintained pytest.
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
- Stop support for UTF-7 that does not contain a SIG.
- Dropping PrettyTable, replaced with pure JSON output in CLI.
### Fixed
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
- Not searching properly for the BOM when trying utf32/16 parent codec.
### Changed
- Improving the package final size by compressing frequencies.json.
- Huge improvement over the larges payload.
### Added
- CLI now produces JSON consumable output.
- Return ASCII if given sequences fit. Given reasonable confidence.
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
### Fixed
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
### Fixed
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
### Fixed
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
### Changed
- Amend the previous release to allow prettytable 2.0 (PR #35)
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
### Fixed
- Fix error while using the package with a python pre-release interpreter (PR #33)
### Changed
- Dependencies refactoring, constraints revised.
### Added
- Add python 3.9 and 3.10 to the supported interpreters
MIT License
Copyright (c) 2019 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@@ -0,0 +1,35 @@
../../../bin/normalizer,sha256=IN_NrOru_sX9DsEdozixdt4LmcIGy_toZGEygzQOYhU,373
charset_normalizer-3.3.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
charset_normalizer-3.3.2.dist-info/LICENSE,sha256=6zGgxaT7Cbik4yBV0lweX5w1iidS_vPNcgIT0cz-4kE,1070
charset_normalizer-3.3.2.dist-info/METADATA,sha256=cfLhl5A6SI-F0oclm8w8ux9wshL1nipdeCdVnYb4AaA,33550
charset_normalizer-3.3.2.dist-info/RECORD,,
charset_normalizer-3.3.2.dist-info/WHEEL,sha256=4ZiCdXIWMxJyEClivrQv1QAHZpQh8kVYU92_ZAVwaok,152
charset_normalizer-3.3.2.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
charset_normalizer-3.3.2.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
charset_normalizer/__init__.py,sha256=UzI3xC8PhmcLRMzSgPb6minTmRq0kWznnCBJ8ZCc2XI,1577
charset_normalizer/__main__.py,sha256=JxY8bleaENOFlLRb9HfoeZCzAMnn2A1oGR5Xm2eyqg0,73
charset_normalizer/__pycache__/__init__.cpython-312.pyc,,
charset_normalizer/__pycache__/__main__.cpython-312.pyc,,
charset_normalizer/__pycache__/api.cpython-312.pyc,,
charset_normalizer/__pycache__/cd.cpython-312.pyc,,
charset_normalizer/__pycache__/constant.cpython-312.pyc,,
charset_normalizer/__pycache__/legacy.cpython-312.pyc,,
charset_normalizer/__pycache__/md.cpython-312.pyc,,
charset_normalizer/__pycache__/models.cpython-312.pyc,,
charset_normalizer/__pycache__/utils.cpython-312.pyc,,
charset_normalizer/__pycache__/version.cpython-312.pyc,,
charset_normalizer/api.py,sha256=WOlWjy6wT8SeMYFpaGbXZFN1TMXa-s8vZYfkL4G29iQ,21097
charset_normalizer/cd.py,sha256=xwZliZcTQFA3jU0c00PRiu9MNxXTFxQkFLWmMW24ZzI,12560
charset_normalizer/cli/__init__.py,sha256=D5ERp8P62llm2FuoMzydZ7d9rs8cvvLXqE-1_6oViPc,100
charset_normalizer/cli/__main__.py,sha256=2F-xURZJzo063Ye-2RLJ2wcmURpbKeAzKwpiws65dAs,9744
charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc,,
charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc,,
charset_normalizer/constant.py,sha256=p0IsOVcEbPWYPOdWhnhRbjK1YVBy6fs05C5vKC-zoxU,40481
charset_normalizer/legacy.py,sha256=T-QuVMsMeDiQEk8WSszMrzVJg_14AMeSkmHdRYhdl1k,2071
charset_normalizer/md.cpython-312-x86_64-linux-gnu.so,sha256=W654QTU3QZI6eWJ0fanScAr0_O6sL0I61fyRSdC-39Y,16064
charset_normalizer/md.py,sha256=NkSuVLK13_a8c7BxZ4cGIQ5vOtGIWOdh22WZEvjp-7U,19624
charset_normalizer/md__mypyc.cpython-312-x86_64-linux-gnu.so,sha256=IlObIV4dmRhFV8V7H-zK4rTxPzTSi9JmrWZD26JQfxI,272640
charset_normalizer/models.py,sha256=I5i0s4aKCCgLPY2tUY3pwkgFA-BUbbNxQ7hVkVTt62s,11624
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
charset_normalizer/utils.py,sha256=teiosMqzKjXyAHXnGdjSBOgnBZwx-SkBbCLrx0UXy8M,11894
charset_normalizer/version.py,sha256=iHKUfHD3kDRSyrh_BN2ojh43TA5-UZQjvbVIEFfpHDs,79


@@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: bdist_wheel (0.41.2)
Root-Is-Purelib: false
Tag: cp312-cp312-manylinux_2_17_x86_64
Tag: cp312-cp312-manylinux2014_x86_64


@@ -0,0 +1,2 @@
[console_scripts]
normalizer = charset_normalizer.cli:cli_detect


@@ -0,0 +1 @@
charset_normalizer


@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""
Charset-Normalizer
~~~~~~~~~~~~~~
The Real First Universal Charset Detector.
A library that helps you read text from an unknown charset encoding.
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
All IANA character set names for which the Python core library provides codecs are supported.
Basic usage:
>>> from charset_normalizer import from_bytes
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
>>> best_guess = results.best()
>>> str(best_guess)
'Bсеки човек има право на образование. Oбразованието!'
Others methods and usages are available - see the full documentation
at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
import logging
from .api import from_bytes, from_fp, from_path, is_binary
from .legacy import detect
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__
__all__ = (
"from_fp",
"from_path",
"from_bytes",
"is_binary",
"detect",
"CharsetMatch",
"CharsetMatches",
"__version__",
"VERSION",
"set_logging_handler",
)
# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())


@@ -0,0 +1,4 @@
from .cli import cli_detect
if __name__ == "__main__":
cli_detect()


@@ -0,0 +1,626 @@
import logging
from os import PathLike
from typing import BinaryIO, List, Optional, Set, Union
from .cd import (
coherence_ratio,
encoding_languages,
mb_encoding_languages,
merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
cut_sequence_chunks,
iana_name,
identify_sig_or_bom,
is_cp_similar,
is_multi_byte_encoding,
should_strip_sig_or_bom,
)
# Will most likely be controversial
# logging.addLevelName(TRACE, "TRACE")
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
def from_bytes(
sequences: Union[bytes, bytearray],
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
If there is no results, it is a strong indicator that the source is binary/not text.
By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
but never take it for granted. Can improve the performance.
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
purpose.
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
Custom logging format and handler can be set manually.
"""
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
"Expected object of type bytes or bytearray, got: {0}".format(
type(sequences)
)
)
if explain:
previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)
length: int = len(sequences)
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
if cp_isolation is not None:
logger.log(
TRACE,
"cp_isolation is set. use this flag for debugging purpose. "
"limited list of encoding allowed : %s.",
", ".join(cp_isolation),
)
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
else:
cp_isolation = []
if cp_exclusion is not None:
logger.log(
TRACE,
"cp_exclusion is set. use this flag for debugging purpose. "
"limited list of encoding excluded : %s.",
", ".join(cp_exclusion),
)
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
else:
cp_exclusion = []
if length <= (chunk_size * steps):
logger.log(
TRACE,
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
steps,
chunk_size,
length,
)
steps = 1
chunk_size = length
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
if is_too_small_sequence:
logger.log(
TRACE,
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
length
),
)
elif is_too_large_sequence:
logger.log(
TRACE,
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
length
),
)
prioritized_encodings: List[str] = []
specified_encoding: Optional[str] = (
any_specified_encoding(sequences) if preemptive_behaviour else None
)
if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
logger.log(
TRACE,
"Detected declarative mark in sequence. Priority +1 given for %s.",
specified_encoding,
)
tested: Set[str] = set()
tested_but_hard_failure: List[str] = []
tested_but_soft_failure: List[str] = []
fallback_ascii: Optional[CharsetMatch] = None
fallback_u8: Optional[CharsetMatch] = None
fallback_specified: Optional[CharsetMatch] = None
results: CharsetMatches = CharsetMatches()
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
if sig_encoding is not None:
prioritized_encodings.append(sig_encoding)
logger.log(
TRACE,
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
len(sig_payload),
sig_encoding,
)
prioritized_encodings.append("ascii")
if "utf_8" not in prioritized_encodings:
prioritized_encodings.append("utf_8")
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
if cp_isolation and encoding_iana not in cp_isolation:
continue
if cp_exclusion and encoding_iana in cp_exclusion:
continue
if encoding_iana in tested:
continue
tested.add(encoding_iana)
decoded_payload: Optional[str] = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
)
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
encoding_iana,
)
continue
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
encoding_iana,
)
continue
try:
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
"Encoding %s does not provide an IncrementalDecoder",
encoding_iana,
)
continue
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)],
encoding=encoding_iana,
)
else:
decoded_payload = str(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :],
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
if not isinstance(e, LookupError):
logger.log(
TRACE,
"Code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
similar_soft_failure_test: bool = False
for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
similar_soft_failure_test = True
break
if similar_soft_failure_test:
logger.log(
TRACE,
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
encoding_iana,
encoding_soft_failed,
)
continue
r_ = range(
0 if not bom_or_sig_available else len(sig_payload),
length,
int(length / steps),
)
multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
)
if multi_byte_bonus:
logger.log(
TRACE,
"Code page %s is a multi byte encoding table and it appear that at least one character "
"was encoded using n-bytes.",
encoding_iana,
)
max_chunk_gave_up: int = int(len(r_) / 4)
max_chunk_gave_up = max(max_chunk_gave_up, 2)
early_stop_count: int = 0
lazy_str_hard_failure = False
md_chunks: List[str] = []
md_ratios = []
try:
for chunk in cut_sequence_chunks(
sequences,
encoding_iana,
r_,
chunk_size,
bom_or_sig_available,
strip_sig_or_bom,
sig_payload,
is_multi_byte_decoder,
decoded_payload,
):
md_chunks.append(chunk)
md_ratios.append(
mess_ratio(
chunk,
threshold,
explain is True and 1 <= len(cp_isolation) <= 2,
)
)
if md_ratios[-1] >= threshold:
early_stop_count += 1
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
except (
UnicodeDecodeError
) as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
if (
not lazy_str_hard_failure
and is_too_large_sequence
and not is_multi_byte_decoder
):
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.log(
TRACE,
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
TRACE,
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
"Computed mean chaos is %f %%.",
encoding_iana,
early_stop_count,
round(mean_mess_ratio * 100, ndigits=3),
)
# Preparing those fallbacks in case we got nothing.
if (
enable_fallback
and encoding_iana in ["ascii", "utf_8", specified_encoding]
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences, encoding_iana, threshold, False, [], decoded_payload
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
elif encoding_iana == "ascii":
fallback_ascii = fallback_entry
else:
fallback_u8 = fallback_entry
continue
logger.log(
TRACE,
"%s passed initial chaos probing. Mean measured chaos is %f %%",
encoding_iana,
round(mean_mess_ratio * 100, ndigits=3),
)
if not is_multi_byte_decoder:
target_languages: List[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
if target_languages:
logger.log(
TRACE,
"{} should target any language(s) of {}".format(
encoding_iana, str(target_languages)
),
)
cd_ratios = []
# We shall skip the CD when its about ASCII
# Most of the time its not relevant to run "language-detection" on it.
if encoding_iana != "ascii":
for chunk in md_chunks:
chunk_languages = coherence_ratio(
chunk,
language_threshold,
",".join(target_languages) if target_languages else None,
)
cd_ratios.append(chunk_languages)
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
if cd_ratios_merged:
logger.log(
TRACE,
"We detected language {} using {}".format(
cd_ratios_merged, encoding_iana
),
)
results.append(
CharsetMatch(
sequences,
encoding_iana,
mean_mess_ratio,
bom_or_sig_available,
cd_ratios_merged,
decoded_payload,
)
)
if (
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
logger.debug(
"Encoding detection: %s is most likely the one.", encoding_iana
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
if encoding_iana == sig_encoding:
logger.debug(
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
"the beginning of the sequence.",
encoding_iana,
)
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
if len(results) == 0:
if fallback_u8 or fallback_ascii or fallback_specified:
logger.log(
TRACE,
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
)
if fallback_specified:
logger.debug(
"Encoding detection: %s will be used as a fallback match",
fallback_specified.encoding,
)
results.append(fallback_specified)
elif (
(fallback_u8 and fallback_ascii is None)
or (
fallback_u8
and fallback_ascii
and fallback_u8.fingerprint != fallback_ascii.fingerprint
)
or (fallback_u8 is not None)
):
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
logger.debug("Encoding detection: ascii will be used as a fallback match")
results.append(fallback_ascii)
if results:
logger.debug(
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
results.best().encoding, # type: ignore
len(results) - 1,
)
else:
logger.debug("Encoding detection: Unable to determine any suitable charset.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return results
def from_fp(
fp: BinaryIO,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same as the function from_bytes but using a file pointer that is already opened and ready.
Will not close the file pointer.
"""
return from_bytes(
fp.read(),
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def from_path(
path: Union[str, bytes, PathLike], # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same as the function from_bytes but with one extra step: opening and reading the given file path in binary mode.
Can raise IOError.
"""
with open(path, "rb") as fp:
return from_fp(
fp,
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def is_binary(
fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes], # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: Optional[List[str]] = None,
cp_exclusion: Optional[List[str]] = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = False,
) -> bool:
"""
Detect if the given input (file, bytes, or path) points to a binary file, i.e. not a string.
Based on the same main heuristic algorithms and default kwargs, with the sole exception that fallback matches
are disabled, making it stricter on content that is ASCII-compatible but unlikely to be a string.
"""
if isinstance(fp_or_path_or_payload, (str, PathLike)):
guesses = from_path(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
elif isinstance(
fp_or_path_or_payload,
(
bytes,
bytearray,
),
):
guesses = from_bytes(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
else:
guesses = from_fp(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
return not guesses
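A minimal usage sketch of the high-level helpers defined above, assuming the package top level re-exports them as upstream does; the file path and sample bytes are placeholders:

# Sketch only: exercising from_bytes / from_path / is_binary defined above.
from charset_normalizer import from_bytes, from_path, is_binary

# Detection straight from raw bytes.
best = from_bytes("Bonjour, où êtes-vous ?".encode("cp1252")).best()
if best is not None:
    print(best.encoding, best.language)  # best guess plus most probable language

# From a path ("some_file.txt" is a placeholder); is_binary() reuses the same
# heuristics with fallback matches disabled.
# matches = from_path("some_file.txt")
# print(is_binary("some_file.txt"))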

View File

@@ -0,0 +1,395 @@
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
from .constant import (
FREQUENCIES,
KO_NAMES,
LANGUAGE_SUPPORTED_COUNT,
TOO_SMALL_SEQUENCE,
ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
is_accentuated,
is_latin,
is_multi_byte_encoding,
is_unicode_range_secondary,
unicode_range,
)
def encoding_unicode_range(iana_name: str) -> List[str]:
"""
Return the Unicode ranges associated with a single-byte code page.
"""
if is_multi_byte_encoding(iana_name):
raise IOError("Function not supported on multi-byte code page")
decoder = importlib.import_module(
"encodings.{}".format(iana_name)
).IncrementalDecoder
p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: Dict[str, int] = {}
character_count: int = 0
for i in range(0x40, 0xFF):
chunk: str = p.decode(bytes([i]))
if chunk:
character_range: Optional[str] = unicode_range(chunk)
if character_range is None:
continue
if is_unicode_range_secondary(character_range) is False:
if character_range not in seen_ranges:
seen_ranges[character_range] = 0
seen_ranges[character_range] += 1
character_count += 1
return sorted(
[
character_range
for character_range in seen_ranges
if seen_ranges[character_range] / character_count >= 0.15
]
)
def unicode_range_languages(primary_range: str) -> List[str]:
"""
Return inferred languages used with a unicode range.
"""
languages: List[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
if unicode_range(character) == primary_range:
languages.append(language)
break
return languages
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
"""
Single-byte encoding language association. Some code pages are heavily linked to particular language(s).
This function resolves that correspondence.
"""
unicode_ranges: List[str] = encoding_unicode_range(iana_name)
primary_range: Optional[str] = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
primary_range = specified_range
break
if primary_range is None:
return ["Latin Based"]
return unicode_range_languages(primary_range)
@lru_cache()
def mb_encoding_languages(iana_name: str) -> List[str]:
"""
Multi-byte encoding language association. Some code pages are heavily linked to particular language(s).
This function resolves that correspondence.
"""
if (
iana_name.startswith("shift_")
or iana_name.startswith("iso2022_jp")
or iana_name.startswith("euc_j")
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
return ["Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]
return []
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
"""
Determine the main characteristics of a supported language: whether it contains accentuated characters and whether it is purely Latin-based.
"""
target_have_accents: bool = False
target_pure_latin: bool = True
for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
target_have_accents = True
if target_pure_latin and is_latin(character) is False:
target_pure_latin = False
return target_have_accents, target_pure_latin
def alphabet_languages(
characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
"""
Return the languages associated with the given characters.
"""
languages: List[Tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
for language, language_characters in FREQUENCIES.items():
target_have_accents, target_pure_latin = get_target_features(language)
if ignore_non_latin and target_pure_latin is False:
continue
if target_have_accents is False and source_have_accents:
continue
character_count: int = len(language_characters)
character_match_count: int = len(
[c for c in language_characters if c in characters]
)
ratio: float = character_match_count / character_count
if ratio >= 0.2:
languages.append((language, ratio))
languages = sorted(languages, key=lambda x: x[1], reverse=True)
return [compatible_language[0] for compatible_language in languages]
def characters_popularity_compare(
language: str, ordered_characters: List[str]
) -> float:
"""
Determine if an ordered list of characters (by occurrence, from most frequent to rarest) matches a particular language.
The result is a ratio between 0. (absolutely no correspondence) and 1. (near-perfect fit).
Beware that this function is not strict on the match in order to ease detection. (Meaning a close match counts as 1.)
"""
if language not in FREQUENCIES:
raise ValueError("{} not available".format(language))
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
ordered_characters_count: int = len(ordered_characters)
target_language_characters_count: int = len(FREQUENCIES[language])
large_alphabet: bool = target_language_characters_count > 26
for character, character_rank in zip(
ordered_characters, range(0, ordered_characters_count)
):
if character not in FREQUENCIES_language_set:
continue
character_rank_in_language: int = FREQUENCIES[language].index(character)
expected_projection_ratio: float = (
target_language_characters_count / ordered_characters_count
)
character_rank_projection: int = int(character_rank * expected_projection_ratio)
if (
large_alphabet is False
and abs(character_rank_projection - character_rank_in_language) > 4
):
continue
if (
large_alphabet is True
and abs(character_rank_projection - character_rank_in_language)
< target_language_characters_count / 3
):
character_approved_count += 1
continue
characters_before_source: List[str] = FREQUENCIES[language][
0:character_rank_in_language
]
characters_after_source: List[str] = FREQUENCIES[language][
character_rank_in_language:
]
characters_before: List[str] = ordered_characters[0:character_rank]
characters_after: List[str] = ordered_characters[character_rank:]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
)
after_match_count: int = len(
set(characters_after) & set(characters_after_source)
)
if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
continue
if len(characters_after_source) == 0 and after_match_count <= 4:
character_approved_count += 1
continue
if (
before_match_count / len(characters_before_source) >= 0.4
or after_match_count / len(characters_after_source) >= 0.4
):
character_approved_count += 1
continue
return character_approved_count / len(ordered_characters)
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
"""
Given a decoded text sequence, return a list of str, split by Unicode range / alphabet.
Ex. a text containing English/Latin with a bit of Hebrew will return two items in the resulting list;
one containing the Latin letters and the other the Hebrew ones.
"""
layers: Dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
layer_target_range: Optional[str] = None
for discovered_range in layers:
if (
is_suspiciously_successive_range(discovered_range, character_range)
is False
):
layer_target_range = discovered_range
break
if layer_target_range is None:
layer_target_range = character_range
if layer_target_range not in layers:
layers[layer_target_range] = character.lower()
continue
layers[layer_target_range] += character.lower()
return list(layers.values())
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
"""
This function merges results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
per_language_ratios: Dict[str, List[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
if language not in per_language_ratios:
per_language_ratios[language] = [ratio]
continue
per_language_ratios[language].append(ratio)
merge = [
(
language,
round(
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
4,
),
)
for language in per_language_ratios
]
return sorted(merge, key=lambda x: x[1], reverse=True)
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
"""
We shall NOT return "English—" in CoherenceMatches because it is an alternative
of "English". This function only keeps the best match and remove the em-dash in it.
"""
index_results: Dict[str, List[float]] = dict()
for result in results:
language, ratio = result
no_em_name: str = language.replace("—", "")
if no_em_name not in index_results:
index_results[no_em_name] = []
index_results[no_em_name].append(ratio)
if any(len(index_results[e]) > 1 for e in index_results):
filtered_results: CoherenceMatches = []
for language in index_results:
filtered_results.append((language, max(index_results[language])))
return filtered_results
return results
@lru_cache(maxsize=2048)
def coherence_ratio(
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
A layer = Character extraction by alphabets/ranges.
"""
results: List[Tuple[str, float]] = []
ignore_non_latin: bool = False
sufficient_match_count: int = 0
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")
for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies: TypeCounter[str] = Counter(layer)
most_common = sequence_frequencies.most_common()
character_count: int = sum(o for c, o in most_common)
if character_count <= TOO_SMALL_SEQUENCE:
continue
popular_character_ordered: List[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
ratio: float = characters_popularity_compare(
language, popular_character_ordered
)
if ratio < threshold:
continue
elif ratio >= 0.8:
sufficient_match_count += 1
results.append((language, round(ratio, 4)))
if sufficient_match_count >= 3:
break
return sorted(
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
)
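An illustrative sketch of running the coherence detection above on an already-decoded string; the sample sentence is made up:

# Illustrative only: language coherence on an already-decoded string.
from charset_normalizer.cd import coherence_ratio, encoding_languages

sample = "ceci est un exemple de texte en français avec des accents éèàù"
for language, ratio in coherence_ratio(sample):
    print(language, ratio)  # ordered from best to worst match

# Single-byte code pages can also be mapped to their likely languages;
# a purely Latin page is expected to yield ["Latin Based"].
print(encoding_languages("cp1252"))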

View File

@@ -0,0 +1,6 @@
from .__main__ import cli_detect, query_yes_no
__all__ = (
"cli_detect",
"query_yes_no",
)

View File

@@ -0,0 +1,296 @@
import argparse
import sys
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from typing import List, Optional
from unicodedata import unidata_version
import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
def query_yes_no(question: str, default: str = "yes") -> bool:
"""Ask a yes/no question via input() and return their answer.
"question" is a string that is presented to the user.
"default" is the presumed answer if the user just hits <Enter>.
It must be "yes" (the default), "no" or None (meaning
an answer is required of the user).
The "answer" return value is True for "yes" or False for "no".
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
"""
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
elif default == "yes":
prompt = " [Y/n] "
elif default == "no":
prompt = " [y/N] "
else:
raise ValueError("invalid default answer: '%s'" % default)
while True:
sys.stdout.write(question + prompt)
choice = input().lower()
if default is not None and choice == "":
return valid[default]
elif choice in valid:
return valid[choice]
else:
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
def cli_detect(argv: Optional[List[str]] = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
:return: 0 if everything is fine, anything else signals trouble
"""
parser = argparse.ArgumentParser(
description="The Real First Universal Charset Detector. "
"Discover originating encoding used on text file. "
"Normalize text to unicode."
)
parser.add_argument(
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
dest="verbose",
help="Display complementary information about file if any. "
"Stdout will contain logs about the detection process.",
)
parser.add_argument(
"-a",
"--with-alternative",
action="store_true",
default=False,
dest="alternatives",
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
)
parser.add_argument(
"-n",
"--normalize",
action="store_true",
default=False,
dest="normalize",
help="Permit to normalize input file. If not set, program does not write anything.",
)
parser.add_argument(
"-m",
"--minimal",
action="store_true",
default=False,
dest="minimal",
help="Only output the charset detected to STDOUT. Disabling JSON output.",
)
parser.add_argument(
"-r",
"--replace",
action="store_true",
default=False,
dest="replace",
help="Replace file when trying to normalize it instead of creating a new one.",
)
parser.add_argument(
"-f",
"--force",
action="store_true",
default=False,
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-t",
"--threshold",
action="store",
default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
)
parser.add_argument(
"--version",
action="version",
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
__version__,
python_version(),
unidata_version,
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
),
help="Show version information and exit.",
)
args = parser.parse_args(argv)
if args.replace is True and args.normalize is False:
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1
if args.force is True and args.replace is False:
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1
if args.threshold < 0.0 or args.threshold > 1.0:
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1
x_ = []
for my_file in args.files:
matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
best_guess = matches.best()
if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else "",
),
file=sys.stderr,
)
x_.append(
CliDetectionResult(
abspath(my_file.name),
None,
[],
[],
"Unknown",
[],
False,
1.0,
0.0,
None,
True,
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
)
if len(matches) > 1 and args.alternatives:
for el in matches:
if el != best_guess:
x_.append(
CliDetectionResult(
abspath(my_file.name),
el.encoding,
el.encoding_aliases,
[
cp
for cp in el.could_be_from_charset
if cp != el.encoding
],
el.language,
el.alphabets,
el.bom,
el.percent_chaos,
el.percent_coherence,
None,
False,
)
)
if args.normalize is True:
if best_guess.encoding.startswith("utf") is True:
print(
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
my_file.name
),
file=sys.stderr,
)
if my_file.closed is False:
my_file.close()
continue
dir_path = dirname(realpath(my_file.name))
file_name = basename(realpath(my_file.name))
o_: List[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
if my_file.closed is False:
my_file.close()
elif (
args.force is False
and query_yes_no(
'Are you sure you want to normalize "{}" by replacing it?'.format(
my_file.name
),
"no",
)
is False
):
if my_file.closed is False:
my_file.close()
continue
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))
with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
fp.write(str(best_guess))
except IOError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
return 2
if my_file.closed is False:
my_file.close()
if args.minimal is False:
print(
dumps(
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4,
)
)
else:
for my_file in args.files:
print(
", ".join(
[
el.encoding or "undefined"
for el in x_
if el.path == abspath(my_file.name)
]
)
)
return 0
if __name__ == "__main__":
cli_detect()
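A short sketch of invoking the entry point above programmatically, assuming the cli subpackage is importable as charset_normalizer.cli; "sample.txt" is a placeholder file that must exist:

# Sketch: calling the CLI entry point defined above with an explicit argv.
from charset_normalizer.cli import cli_detect

exit_code = cli_detect(["sample.txt", "--minimal"])  # prints only the detected charset
print(exit_code)  # 0 means everything went fine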

File diff suppressed because it is too large

View File

@@ -0,0 +1,54 @@
from typing import Any, Dict, Optional, Union
from warnings import warn
from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE
def detect(
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> Dict[str, Optional[Union[str, float]]]:
"""
chardet legacy method.
Detect the encoding of the given byte string. It should be mostly backward-compatible.
The encoding name will match Chardet's own naming whenever possible. (Except for encoding names it does not support.)
This function is deprecated and should only be used to migrate your project easily; consult the documentation for
further information. Not planned for removal.
:param byte_str: The byte sequence to examine.
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
"""
if len(kwargs):
warn(
f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
)
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError( # pragma: nocover
"Expected object of type bytes or bytearray, got: "
"{0}".format(type(byte_str))
)
if isinstance(byte_str, bytearray):
byte_str = bytes(byte_str)
r = from_bytes(byte_str).best()
encoding = r.encoding if r is not None else None
language = r.language if r is not None and r.language != "Unknown" else ""
confidence = 1.0 - r.chaos if r is not None else None
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
# but chardet does return 'utf-8-sig' and it is a valid codec name.
if r is not None and encoding == "utf_8" and r.bom:
encoding += "_sig"
if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
encoding = CHARDET_CORRESPONDENCE[encoding]
return {
"encoding": encoding,
"language": language,
"confidence": confidence,
}
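A minimal sketch of the chardet-compatible shim above, assuming this module is importable as charset_normalizer.legacy; the sample bytes are illustrative:

# Sketch: drop-in usage mirroring chardet.detect().
from charset_normalizer.legacy import detect

result = detect("Hello, wörld".encode("cp1252"))
print(result["encoding"], result["language"], result["confidence"])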

View File

@@ -0,0 +1,615 @@
from functools import lru_cache
from logging import getLogger
from typing import List, Optional
from .constant import (
COMMON_SAFE_ASCII_CHARACTERS,
TRACE,
UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
is_accentuated,
is_arabic,
is_arabic_isolated_form,
is_case_variable,
is_cjk,
is_emoticon,
is_hangul,
is_hiragana,
is_katakana,
is_latin,
is_punctuation,
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
)
class MessDetectorPlugin:
"""
Base abstract class used for mess detection plugins.
All detectors MUST extend and implement given methods.
"""
def eligible(self, character: str) -> bool:
"""
Determine if given character should be fed in.
"""
raise NotImplementedError # pragma: nocover
def feed(self, character: str) -> None:
"""
The main routine to be executed upon each character.
Insert the logic by which the text would be considered chaotic.
"""
raise NotImplementedError # pragma: nocover
def reset(self) -> None: # pragma: no cover
"""
Reset the plugin to its initial state.
"""
raise NotImplementedError
@property
def ratio(self) -> float:
"""
Compute the chaos ratio based on what your feed() has seen.
Must NOT be lower than 0.; no upper-bound restriction.
"""
raise NotImplementedError # pragma: nocover
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._punctuation_count: int = 0
self._symbol_count: int = 0
self._character_count: int = 0
self._last_printable_char: Optional[str] = None
self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character != self._last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
):
if is_punctuation(character):
self._punctuation_count += 1
elif (
character.isdigit() is False
and is_symbol(character)
and is_emoticon(character) is False
):
self._symbol_count += 2
self._last_printable_char = character
def reset(self) -> None: # pragma: no cover
self._punctuation_count = 0
self._character_count = 0
self._symbol_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
ratio_of_punctuation: float = (
self._punctuation_count + self._symbol_count
) / self._character_count
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._accentuated_count: int = 0
def eligible(self, character: str) -> bool:
return character.isalpha()
def feed(self, character: str) -> None:
self._character_count += 1
if is_accentuated(character):
self._accentuated_count += 1
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._accentuated_count = 0
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._unprintable_count: int = 0
self._character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1
def reset(self) -> None: # pragma: no cover
self._unprintable_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._unprintable_count * 8) / self._character_count
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._successive_count: int = 0
self._character_count: int = 0
self._last_latin_character: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
def feed(self, character: str) -> None:
self._character_count += 1
if (
self._last_latin_character is not None
and is_accentuated(character)
and is_accentuated(self._last_latin_character)
):
if character.isupper() and self._last_latin_character.isupper():
self._successive_count += 1
# Worse if it's the same char duplicated with a different accent.
if remove_accent(character) == remove_accent(self._last_latin_character):
self._successive_count += 1
self._last_latin_character = character
def reset(self) -> None: # pragma: no cover
self._successive_count = 0
self._character_count = 0
self._last_latin_character = None
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._successive_count * 2) / self._character_count
class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
self._suspicious_successive_range_count: int = 0
self._character_count: int = 0
self._last_printable_seen: Optional[str] = None
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character.isspace()
or is_punctuation(character)
or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
if self._last_printable_seen is None:
self._last_printable_seen = character
return
unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
unicode_range_b: Optional[str] = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
self._last_printable_seen = character
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._suspicious_successive_range_count = 0
self._last_printable_seen = None
@property
def ratio(self) -> float:
if self._character_count <= 24:
return 0.0
ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count
return ratio_of_suspicious_range_usage
class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._word_count: int = 0
self._bad_word_count: int = 0
self._foreign_long_count: int = 0
self._is_current_word_bad: bool = False
self._foreign_long_watch: bool = False
self._character_count: int = 0
self._bad_character_count: int = 0
self._buffer: str = ""
self._buffer_accent_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character.isalpha():
self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
self._foreign_long_watch is False
and (is_latin(character) is False or is_accentuated(character))
and is_cjk(character) is False
and is_hangul(character) is False
and is_katakana(character) is False
and is_hiragana(character) is False
and is_thai(character) is False
):
self._foreign_long_watch = True
return
if not self._buffer:
return
if (
character.isspace() or is_punctuation(character) or is_separator(character)
) and self._buffer:
self._word_count += 1
buffer_length: int = len(self._buffer)
self._character_count += buffer_length
if buffer_length >= 4:
if self._buffer_accent_count / buffer_length > 0.34:
self._is_current_word_bad = True
# Words/buffers ending with an upper-case accentuated letter are so rare
# that we will consider them all suspicious. Same weight as foreign_long suspicious.
if (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
if buffer_length >= 24 and self._foreign_long_watch:
camel_case_dst = [
i
for c, i in zip(self._buffer, range(0, buffer_length))
if c.isupper()
]
probable_camel_cased: bool = False
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
probable_camel_cased = True
if not probable_camel_cased:
self._foreign_long_count += 1
self._is_current_word_bad = True
if self._is_current_word_bad:
self._bad_word_count += 1
self._bad_character_count += len(self._buffer)
self._is_current_word_bad = False
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
and is_symbol(character)
):
self._is_current_word_bad = True
self._buffer += character
def reset(self) -> None: # pragma: no cover
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
self._bad_word_count = 0
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0
self._foreign_long_count = 0
@property
def ratio(self) -> float:
if self._word_count <= 10 and self._foreign_long_count == 0:
return 0.0
return self._bad_character_count / self._character_count
class CjkInvalidStopPlugin(MessDetectorPlugin):
"""
GB (Chinese) based encodings often render the full stop incorrectly when the content does not fit, and this
can be easily detected. Searching for an overuse of '丅' and '丄'.
"""
def __init__(self) -> None:
self._wrong_stop_count: int = 0
self._cjk_character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character in {"丅", "丄"}:
self._wrong_stop_count += 1
return
if is_cjk(character):
self._cjk_character_count += 1
def reset(self) -> None: # pragma: no cover
self._wrong_stop_count = 0
self._cjk_character_count = 0
@property
def ratio(self) -> float:
if self._cjk_character_count < 16:
return 0.0
return self._wrong_stop_count / self._cjk_character_count
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._buf: bool = False
self._character_count_since_last_sep: int = 0
self._successive_upper_lower_count: int = 0
self._successive_upper_lower_count_final: int = 0
self._character_count: int = 0
self._last_alpha_seen: Optional[str] = None
self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
is_concerned = character.isalpha() and is_case_variable(character)
chunk_sep = is_concerned is False
if chunk_sep and self._character_count_since_last_sep > 0:
if (
self._character_count_since_last_sep <= 64
and character.isdigit() is False
and self._current_ascii_only is False
):
self._successive_upper_lower_count_final += (
self._successive_upper_lower_count
)
self._successive_upper_lower_count = 0
self._character_count_since_last_sep = 0
self._last_alpha_seen = None
self._buf = False
self._character_count += 1
self._current_ascii_only = True
return
if self._current_ascii_only is True and character.isascii() is False:
self._current_ascii_only = False
if self._last_alpha_seen is not None:
if (character.isupper() and self._last_alpha_seen.islower()) or (
character.islower() and self._last_alpha_seen.isupper()
):
if self._buf is True:
self._successive_upper_lower_count += 2
self._buf = False
else:
self._buf = True
else:
self._buf = False
self._character_count += 1
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._character_count_since_last_sep = 0
self._successive_upper_lower_count = 0
self._successive_upper_lower_count_final = 0
self._last_alpha_seen = None
self._buf = False
self._current_ascii_only = True
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return self._successive_upper_lower_count_final / self._character_count
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._isolated_form_count: int = 0
def reset(self) -> None: # pragma: no cover
self._character_count = 0
self._isolated_form_count = 0
def eligible(self, character: str) -> bool:
return is_arabic(character)
def feed(self, character: str) -> None:
self._character_count += 1
if is_arabic_isolated_form(character):
self._isolated_form_count += 1
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
isolated_form_usage: float = self._isolated_form_count / self._character_count
return isolated_form_usage
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
"""
Determine if two Unicode ranges seen next to each other can be considered suspicious.
"""
if unicode_range_a is None or unicode_range_b is None:
return True
if unicode_range_a == unicode_range_b:
return False
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
return False
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
return False
# Latin characters can be accompanied by a combining diacritical mark,
# e.g. Vietnamese.
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
"Combining" in unicode_range_a or "Combining" in unicode_range_b
):
return False
keywords_range_a, keywords_range_b = unicode_range_a.split(
" "
), unicode_range_b.split(" ")
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
continue
if el in keywords_range_b:
return False
# Japanese Exception
range_a_jp_chars, range_b_jp_chars = (
unicode_range_a
in (
"Hiragana",
"Katakana",
),
unicode_range_b in ("Hiragana", "Katakana"),
)
if (range_a_jp_chars or range_b_jp_chars) and (
"CJK" in unicode_range_a or "CJK" in unicode_range_b
):
return False
if range_a_jp_chars and range_b_jp_chars:
return False
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
# Chinese/Japanese use dedicated range for punctuation and/or separators.
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
unicode_range_a in ["Katakana", "Hiragana"]
and unicode_range_b in ["Katakana", "Hiragana"]
):
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
return True
@lru_cache(maxsize=2048)
def mess_ratio(
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold stops the computation early.
"""
detectors: List[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
]
length: int = len(decoded_sequence) + 1
mean_mess_ratio: float = 0.0
if length < 512:
intermediary_mean_mess_ratio_calc: int = 32
elif length <= 1024:
intermediary_mean_mess_ratio_calc = 64
else:
intermediary_mean_mess_ratio_calc = 128
for character, index in zip(decoded_sequence + "\n", range(length)):
for detector in detectors:
if detector.eligible(character):
detector.feed(character)
if (
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
) or index == length - 1:
mean_mess_ratio = sum(dt.ratio for dt in detectors)
if mean_mess_ratio >= maximum_threshold:
break
if debug:
logger = getLogger("charset_normalizer")
logger.log(
TRACE,
"Mess-detector extended-analysis start. "
f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
f"maximum_threshold={maximum_threshold}",
)
if len(decoded_sequence) > 16:
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
for dt in detectors: # pragma: nocover
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
return round(mean_mess_ratio, 3)
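An illustrative comparison of mess_ratio on readable text versus typical mojibake; both sample strings are made up:

# Illustrative only: mess_ratio on clean text vs. mojibake-looking text.
from charset_normalizer.md import mess_ratio

clean = "I am a perfectly readable sentence."
garbled = "ÃƒÂ©Ã‚ÂµÃƒÂ¨Ã‚Â£ mojibake Ã¢â‚¬Å“quoteÃ¢â‚¬Â"
print(mess_ratio(clean))    # expected to stay close to 0.0
print(mess_ratio(garbled))  # expected to be noticeably higher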

View File

@@ -0,0 +1,340 @@
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
from .constant import TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
def __init__(
self,
payload: bytes,
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
languages: "CoherenceMatches",
decoded_payload: Optional[str] = None,
):
self._payload: bytes = payload
self._encoding: str = guessed_encoding
self._mean_mess_ratio: float = mean_mess_ratio
self._languages: CoherenceMatches = languages
self._has_sig_or_bom: bool = has_sig_or_bom
self._unicode_ranges: Optional[List[str]] = None
self._leaves: List[CharsetMatch] = []
self._mean_coherence_ratio: float = 0.0
self._output_payload: Optional[bytes] = None
self._output_encoding: Optional[str] = None
self._string: Optional[str] = decoded_payload
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
raise TypeError(
"__eq__ cannot be invoked on {} and {}.".format(
str(other.__class__), str(self.__class__)
)
)
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
def __lt__(self, other: object) -> bool:
"""
Implemented to make sorted available upon CharsetMatches items.
"""
if not isinstance(other, CharsetMatch):
raise ValueError
chaos_difference: float = abs(self.chaos - other.chaos)
coherence_difference: float = abs(self.coherence - other.coherence)
# Below 1% difference --> Use Coherence
if chaos_difference < 0.01 and coherence_difference > 0.02:
return self.coherence > other.coherence
elif chaos_difference < 0.01 and coherence_difference <= 0.02:
# When facing a difficult decision, use the result that decoded as many multi-byte characters as possible.
# Preserve RAM usage!
if len(self._payload) >= TOO_BIG_SEQUENCE:
return self.chaos < other.chaos
return self.multi_byte_usage > other.multi_byte_usage
return self.chaos < other.chaos
@property
def multi_byte_usage(self) -> float:
return 1.0 - (len(str(self)) / len(self.raw))
def __str__(self) -> str:
# Lazy Str Loading
if self._string is None:
self._string = str(self._payload, self._encoding, "strict")
return self._string
def __repr__(self) -> str:
return "<CharsetMatch '{}' bytes({})>".format(self.encoding, self.fingerprint)
def add_submatch(self, other: "CharsetMatch") -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
other.__class__
)
)
other._string = None # Unload RAM usage; dirty trick.
self._leaves.append(other)
@property
def encoding(self) -> str:
return self._encoding
@property
def encoding_aliases(self) -> List[str]:
"""
Encodings are known by many names; using this could help when searching for IBM855 when it is listed as CP855.
"""
also_known_as: List[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
elif self.encoding == p:
also_known_as.append(u)
return also_known_as
@property
def bom(self) -> bool:
return self._has_sig_or_bom
@property
def byte_order_mark(self) -> bool:
return self._has_sig_or_bom
@property
def languages(self) -> List[str]:
"""
Return the complete list of possible languages found in decoded sequence.
Usually not really useful. The returned list may be empty even if the 'language' property returns something != 'Unknown'.
"""
return [e[0] for e in self._languages]
@property
def language(self) -> str:
"""
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
"Unknown".
"""
if not self._languages:
# Trying to infer the language based on the given encoding
# It's either English or we should not commit ourselves in certain cases.
if "ascii" in self.could_be_from_charset:
return "English"
# doing it there to avoid circular import
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
languages = (
mb_encoding_languages(self.encoding)
if is_multi_byte_encoding(self.encoding)
else encoding_languages(self.encoding)
)
if len(languages) == 0 or "Latin Based" in languages:
return "Unknown"
return languages[0]
return self._languages[0][0]
@property
def chaos(self) -> float:
return self._mean_mess_ratio
@property
def coherence(self) -> float:
if not self._languages:
return 0.0
return self._languages[0][1]
@property
def percent_chaos(self) -> float:
return round(self.chaos * 100, ndigits=3)
@property
def percent_coherence(self) -> float:
return round(self.coherence * 100, ndigits=3)
@property
def raw(self) -> bytes:
"""
Original untouched bytes.
"""
return self._payload
@property
def submatch(self) -> List["CharsetMatch"]:
return self._leaves
@property
def has_submatch(self) -> bool:
return len(self._leaves) > 0
@property
def alphabets(self) -> List[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
detected_ranges: List[Optional[str]] = [
unicode_range(char) for char in str(self)
]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
def could_be_from_charset(self) -> List[str]:
"""
The complete list of encodings that output the exact SAME str result and therefore could be the originating
encoding.
This list does include the encoding available in the 'encoding' property.
"""
return [self._encoding] + [m.encoding for m in self._leaves]
def output(self, encoding: str = "utf_8") -> bytes:
"""
Method to get the re-encoded bytes payload using the given target encoding. Defaults to UTF-8.
Any characters that cannot be encoded are handled with the encoder's 'replace' strategy.
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
self._output_payload = str(self).encode(encoding, "replace")
return self._output_payload # type: ignore
@property
def fingerprint(self) -> str:
"""
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
"""
return sha256(self.output()).hexdigest()
class CharsetMatches:
"""
Container holding every CharsetMatch item, ordered by default from the most probable to the least probable.
Acts like a list (iterable) but does not implement all related methods.
"""
def __init__(self, results: Optional[List[CharsetMatch]] = None):
self._results: List[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
def __getitem__(self, item: Union[int, str]) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
"""
if isinstance(item, int):
return self._results[item]
if isinstance(item, str):
item = iana_name(item, False)
for result in self._results:
if item in result.could_be_from_charset:
return result
raise KeyError
def __len__(self) -> int:
return len(self._results)
def __bool__(self) -> bool:
return len(self._results) > 0
def append(self, item: CharsetMatch) -> None:
"""
Insert a single match. It will be inserted so as to preserve sorting.
It can be inserted as a submatch.
"""
if not isinstance(item, CharsetMatch):
raise ValueError(
"Cannot append instance '{}' to CharsetMatches".format(
str(item.__class__)
)
)
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
if len(item.raw) <= TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
return
self._results.append(item)
self._results = sorted(self._results)
def best(self) -> Optional["CharsetMatch"]:
"""
Simply return the first match. Strict equivalent to matches[0].
"""
if not self._results:
return None
return self._results[0]
def first(self) -> Optional["CharsetMatch"]:
"""
Redundant method, call the method best(). Kept for BC reasons.
"""
return self.best()
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
def __init__(
self,
path: str,
encoding: Optional[str],
encoding_aliases: List[str],
alternative_encodings: List[str],
language: str,
alphabets: List[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
unicode_path: Optional[str],
is_preferred: bool,
):
self.path: str = path
self.unicode_path: Optional[str] = unicode_path
self.encoding: Optional[str] = encoding
self.encoding_aliases: List[str] = encoding_aliases
self.alternative_encodings: List[str] = alternative_encodings
self.language: str = language
self.alphabets: List[str] = alphabets
self.has_sig_or_bom: bool = has_sig_or_bom
self.chaos: float = chaos
self.coherence: float = coherence
self.is_preferred: bool = is_preferred
@property
def __dict__(self) -> Dict[str, Any]: # type: ignore
return {
"path": self.path,
"encoding": self.encoding,
"encoding_aliases": self.encoding_aliases,
"alternative_encodings": self.alternative_encodings,
"language": self.language,
"alphabets": self.alphabets,
"has_sig_or_bom": self.has_sig_or_bom,
"chaos": self.chaos,
"coherence": self.coherence,
"unicode_path": self.unicode_path,
"is_preferred": self.is_preferred,
}
def to_json(self) -> str:
return dumps(self.__dict__, ensure_ascii=True, indent=4)
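A brief sketch of how the model objects above are typically consumed once a detection has run; the payload is illustrative:

# Sketch: inspecting CharsetMatch / CharsetMatches results.
from charset_normalizer import from_bytes

matches = from_bytes("Späßchen mit Umlauten".encode("cp1252"))
best = matches.best()
if best is not None:
    print(best.encoding)               # guessed codec name
    print(best.could_be_from_charset)  # every codec yielding the exact same text
    print(best.percent_chaos, best.percent_coherence)
    print(str(best))                   # the decoded text itself
    print(best.output("utf_8")[:16])   # re-encoded bytes, UTF-8 by default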

View File

@@ -0,0 +1,421 @@
import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union
from _multibytecodec import MultibyteIncrementalDecoder
from .constant import (
ENCODING_MARKS,
IANA_SUPPORTED_SIMILAR,
RE_POSSIBLE_ENCODING_INDICATION,
UNICODE_RANGES_COMBINED,
UNICODE_SECONDARY_RANGE_KEYWORD,
UTF8_MAXIMAL_ALLOCATION,
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError:
return False
return (
"WITH GRAVE" in description
or "WITH ACUTE" in description
or "WITH CEDILLA" in description
or "WITH DIAERESIS" in description
or "WITH CIRCUMFLEX" in description
or "WITH TILDE" in description
or "WITH MACRON" in description
or "WITH RING ABOVE" in description
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
decomposed: str = unicodedata.decomposition(character)
if not decomposed:
return character
codes: List[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
"""
Retrieve the official Unicode range name for a single character.
"""
character_ord: int = ord(character)
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
if character_ord in ord_range:
return range_name
return None
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError:
return False
return "LATIN" in description
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "P" in character_category:
return True
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Punctuation" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "S" in character_category or "N" in character_category:
return True
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Forms" in character_range and character_category != "Lo"
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
character_range: Optional[str] = unicode_range(character)
if character_range is None:
return False
return "Emoticons" in character_range or "Pictographs" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
if character.isspace() or character in {"｜", "+", "<", ">"}:
return True
character_category: str = unicodedata.category(character)
return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
return character.islower() != character.isupper()
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "CJK" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "HIRAGANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "KATAKANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "HANGUL" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "THAI" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "ARABIC" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError:
return False
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
return (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1A" # Why? It's the ASCII substitute character.
and character != "\ufeff" # bug discovered in Python,
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
)
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> Optional[str]:
"""
Extract, using an ASCII-only decoder, any specified encoding in the first n bytes.
"""
if not isinstance(sequence, bytes):
raise TypeError
seq_len: int = len(sequence)
results: List[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
)
if len(results) == 0:
return None
for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if encoding_alias == specified_encoding:
return encoding_iana
if encoding_iana == specified_encoding:
return encoding_iana
return None
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
"""
Verify whether a specific encoding is a multi-byte one based on its IANA name.
"""
return name in {
"utf_8",
"utf_8_sig",
"utf_16",
"utf_16_be",
"utf_16_le",
"utf_32",
"utf_32_le",
"utf_32_be",
"utf_7",
} or issubclass(
importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
MultibyteIncrementalDecoder,
)
def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
"""
Identify and extract SIG/BOM in given sequence.
"""
for iana_encoding in ENCODING_MARKS:
marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
for mark in marks:
if sequence.startswith(mark):
return iana_encoding, mark
return None, b""
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
return iana_encoding not in {"utf_16", "utf_32"}
def iana_name(cp_name: str, strict: bool = True) -> str:
cp_name = cp_name.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if cp_name in [encoding_alias, encoding_iana]:
return encoding_iana
if strict:
raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))
return cp_name
def range_scan(decoded_sequence: str) -> List[str]:
ranges: Set[str] = set()
for character in decoded_sequence:
character_range: Optional[str] = unicode_range(character)
if character_range is None:
continue
ranges.add(character_range)
return list(ranges)
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0
decoder_a = importlib.import_module(
"encodings.{}".format(iana_name_a)
).IncrementalDecoder
decoder_b = importlib.import_module(
"encodings.{}".format(iana_name_b)
).IncrementalDecoder
id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
character_match_count: int = 0
for i in range(255):
to_be_decoded: bytes = bytes([i])
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
character_match_count += 1
return character_match_count / 254
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
"""
Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated using
the function cp_similarity.
"""
return (
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)
def set_logging_handler(
name: str = "charset_normalizer",
level: int = logging.INFO,
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
logger = logging.getLogger(name)
logger.setLevel(level)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_string))
logger.addHandler(handler)
def cut_sequence_chunks(
sequences: bytes,
encoding_iana: str,
offsets: range,
chunk_size: int,
bom_or_sig_available: bool,
strip_sig_or_bom: bool,
sig_payload: bytes,
is_multi_byte_decoder: bool,
decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
if decoded_payload and is_multi_byte_decoder is False:
for i in offsets:
chunk = decoded_payload[i : i + chunk_size]
if not chunk:
break
yield chunk
else:
for i in offsets:
chunk_end = i + chunk_size
if chunk_end > len(sequences) + 8:
continue
cut_sequence = sequences[i : i + chunk_size]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
)
# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0:
chunk_partial_size_chk: int = min(chunk_size, 16)
if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j:chunk_end]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
if chunk[:chunk_partial_size_chk] in decoded_payload:
break
yield chunk
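# Editor's note: a hedged usage sketch for the helpers defined above (not part of
# the upstream charset_normalizer module); the similarity values depend on the
# codecs shipped with the running Python interpreter.
if __name__ == "__main__":
    set_logging_handler()                      # attach a simple stderr handler to the package logger
    print(iana_name("utf8"))                   # normalized to "utf_8"
    print(cp_similarity("cp1252", "latin_1"))  # fraction of single bytes that decode identically
    print(is_cp_similar("cp1252", "latin_1"))  # likely True: both are single-byte Latin code pages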

View File

@@ -0,0 +1,6 @@
"""
Expose version
"""
__version__ = "3.3.2"
VERSION = __version__.split(".")

View File

@@ -0,0 +1,25 @@
Copyright (c) 2013-2022, GeoPandas developers.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of GeoPandas nor the names of its contributors may
be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,56 @@
Metadata-Version: 2.1
Name: geopandas
Version: 1.0.1
Summary: Geographic pandas extensions
Author-email: Kelsey Jordahl <kjordahl@alum.mit.edu>
Maintainer: GeoPandas contributors
License: BSD 3-Clause
Project-URL: Home, https://geopandas.org
Project-URL: Repository, https://github.com/geopandas/geopandas
Keywords: GIS,cartography,pandas,shapely
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Topic :: Scientific/Engineering :: GIS
Requires-Python: >=3.9
Description-Content-Type: text/x-rst
License-File: LICENSE.txt
Requires-Dist: numpy >=1.22
Requires-Dist: pyogrio >=0.7.2
Requires-Dist: packaging
Requires-Dist: pandas >=1.4.0
Requires-Dist: pyproj >=3.3.0
Requires-Dist: shapely >=2.0.0
Provides-Extra: all
Requires-Dist: psycopg-binary >=3.1.0 ; extra == 'all'
Requires-Dist: SQLAlchemy >=1.3 ; extra == 'all'
Requires-Dist: geopy ; extra == 'all'
Requires-Dist: matplotlib >=3.5.0 ; extra == 'all'
Requires-Dist: mapclassify ; extra == 'all'
Requires-Dist: xyzservices ; extra == 'all'
Requires-Dist: folium ; extra == 'all'
Requires-Dist: GeoAlchemy2 ; extra == 'all'
Requires-Dist: pyarrow >=8.0.0 ; extra == 'all'
Provides-Extra: dev
Requires-Dist: pytest >=3.1.0 ; extra == 'dev'
Requires-Dist: pytest-cov ; extra == 'dev'
Requires-Dist: pytest-xdist ; extra == 'dev'
Requires-Dist: codecov ; extra == 'dev'
Requires-Dist: black ; extra == 'dev'
Requires-Dist: pre-commit ; extra == 'dev'
GeoPandas is a project to add support for geographic data to
`pandas`_ objects.
The goal of GeoPandas is to make working with geospatial data in
Python easier. It combines the capabilities of `pandas`_ and `shapely`_,
providing geospatial operations in pandas and a high-level interface
to shapely for working with multiple geometries. GeoPandas enables you
to easily do operations in Python that would otherwise require a spatial
database such as PostGIS.
.. _pandas: https://pandas.pydata.org
.. _shapely: https://shapely.readthedocs.io/en/latest/

View File

@@ -0,0 +1,150 @@
geopandas-1.0.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
geopandas-1.0.1.dist-info/LICENSE.txt,sha256=xg76FZN6WoKlonhqJr50UpNv_-x6hFck2wdA5UU1yOo,1498
geopandas-1.0.1.dist-info/METADATA,sha256=ksAPKWU5ZSotcOPmWj7lQ7zm56Z9gb1O_lHgnOjoe2o,2214
geopandas-1.0.1.dist-info/RECORD,,
geopandas-1.0.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
geopandas-1.0.1.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
geopandas-1.0.1.dist-info/top_level.txt,sha256=L_hA4DbHsRJmcvNncFdVSeojIfuyaIX9f3dRJ2T0hQA,10
geopandas/__init__.py,sha256=xljJeGLTAa-yH8LLDx35qKfMijAbuLQQG0uot4Qyims,894
geopandas/__pycache__/__init__.cpython-312.pyc,,
geopandas/__pycache__/_compat.cpython-312.pyc,,
geopandas/__pycache__/_config.cpython-312.pyc,,
geopandas/__pycache__/_decorator.cpython-312.pyc,,
geopandas/__pycache__/_version.cpython-312.pyc,,
geopandas/__pycache__/array.cpython-312.pyc,,
geopandas/__pycache__/base.cpython-312.pyc,,
geopandas/__pycache__/conftest.cpython-312.pyc,,
geopandas/__pycache__/explore.cpython-312.pyc,,
geopandas/__pycache__/geodataframe.cpython-312.pyc,,
geopandas/__pycache__/geoseries.cpython-312.pyc,,
geopandas/__pycache__/plotting.cpython-312.pyc,,
geopandas/__pycache__/sindex.cpython-312.pyc,,
geopandas/__pycache__/testing.cpython-312.pyc,,
geopandas/_compat.py,sha256=luXlpK8V3ouZw47qEDP8J2flxpBEZygbtGG_qA5wkpU,2709
geopandas/_config.py,sha256=-wi0X7tj5o-5mOj1OjvdvwFgE1vzLOeSQguoTRgAupI,3925
geopandas/_decorator.py,sha256=Brt7MFqTmtwrcWBFb4fUQ9rbQwZ5M8pyBZpDIwfecos,1996
geopandas/_version.py,sha256=h9mvGbfjFrenrN_x9u1gH6-q7j6G2ZApD2s6SrGdzWQ,497
geopandas/array.py,sha256=BkzDL5LS9Wiw7QiMa1d7AMhIjDHtgMGRoI4Jvit_KwA,58837
geopandas/base.py,sha256=ybIgYJ9zgHsdmJMyN-sVi-9z05pSI6JXc_BTWxt6ARY,213600
geopandas/conftest.py,sha256=C_nsiUywnW_6HAyIOklinbTHbD7aKKGSkBY5BIMjgmg,1374
geopandas/datasets/__init__.py,sha256=pxIa7JvJv5kFajD632CZzbL-_cSqpI0yG4ttTx8ILQg,1012
geopandas/datasets/__pycache__/__init__.cpython-312.pyc,,
geopandas/explore.py,sha256=FkUfz9Yl-xg-iWlKKOUK_ZrmL0FKORjKIuEbc7AIg3g,37151
geopandas/geodataframe.py,sha256=gVRJDrZtpZev-0A3X28FPt9YUfwPM8-Wd_lUkfVYCpU,105949
geopandas/geoseries.py,sha256=EyX-xHgpF_ulJj1pJhNbJkyWo4Z2N5PP8LS22W3kJeU,51980
geopandas/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
geopandas/io/__pycache__/__init__.cpython-312.pyc,,
geopandas/io/__pycache__/_geoarrow.cpython-312.pyc,,
geopandas/io/__pycache__/_pyarrow_hotfix.cpython-312.pyc,,
geopandas/io/__pycache__/arrow.cpython-312.pyc,,
geopandas/io/__pycache__/file.cpython-312.pyc,,
geopandas/io/__pycache__/sql.cpython-312.pyc,,
geopandas/io/__pycache__/util.cpython-312.pyc,,
geopandas/io/_geoarrow.py,sha256=UqMx_SQ1uw5VLgf7yxaOG7Rd8NbKHUw7lVFkUNDZIP8,21155
geopandas/io/_pyarrow_hotfix.py,sha256=_bB-IREMLQOkKY4QRryJZgMqwswL7Y_rC6xAc_AClWg,2354
geopandas/io/arrow.py,sha256=tQ-6L1Gg7FI39X3TCtfNKQrR2AsIKvKOoarxflrhVYQ,31157
geopandas/io/file.py,sha256=mu0N80_6Y-7ww1o35N3Ol1-pAVyIhs-jNiXS3qtMlNs,31080
geopandas/io/sql.py,sha256=0EolK6C-ihJL0FdrHemIOuFYmEXOZxH0ptAoYNCZCrw,15761
geopandas/io/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
geopandas/io/tests/__pycache__/__init__.cpython-312.pyc,,
geopandas/io/tests/__pycache__/generate_legacy_storage_files.cpython-312.pyc,,
geopandas/io/tests/__pycache__/test_arrow.cpython-312.pyc,,
geopandas/io/tests/__pycache__/test_file.cpython-312.pyc,,
geopandas/io/tests/__pycache__/test_file_geom_types_drivers.cpython-312.pyc,,
geopandas/io/tests/__pycache__/test_geoarrow.cpython-312.pyc,,
geopandas/io/tests/__pycache__/test_infer_schema.cpython-312.pyc,,
geopandas/io/tests/__pycache__/test_pickle.cpython-312.pyc,,
geopandas/io/tests/__pycache__/test_sql.cpython-312.pyc,,
geopandas/io/tests/generate_legacy_storage_files.py,sha256=I_nU6NGHRJAlWhlYDP88riHBX-YIzOazq8oos7ZwmaQ,2746
geopandas/io/tests/test_arrow.py,sha256=IDBtrGV-xHloDJXcEX5oJi0I8CCxlsLwxlH1E5P3h20,44514
geopandas/io/tests/test_file.py,sha256=snteWVjdj6yic5RCxudEVa2KFxRq786XdSjy557DtLk,50233
geopandas/io/tests/test_file_geom_types_drivers.py,sha256=zPFAPeQQXvWfS7_2_uztbZ4V4cFcfKqvuMQWOSr0BFc,9496
geopandas/io/tests/test_geoarrow.py,sha256=fz4b6MUfoGJgVututKACQ-c07YnvxW6qPRfZDi7UDSQ,19341
geopandas/io/tests/test_infer_schema.py,sha256=GcGUL9KRI9dYIpgaapS2ew2RTvma43twGdymn1Q2QUg,8501
geopandas/io/tests/test_pickle.py,sha256=UQu3Fzx3YvHm_R13sE2PcHSb7FWNOINgNDw9obt1q84,1407
geopandas/io/tests/test_sql.py,sha256=p1W8QwiGAl6J8adNp4ae57cs0SJVupWDEvP6lpDMePE,31156
geopandas/io/util.py,sha256=hXhQm6eONrZ9Oq_SoLS54SMKvIJkSAukaGE1rDOzFWs,3023
geopandas/plotting.py,sha256=jY6Wd1md_i2oPBuhFkdn7N8IUZp_ygNx0Tcp1p1Bz1Y,34029
geopandas/sindex.py,sha256=eaHX0kT_VeyOtgciZnUE_NN2MxpUmWo3mDG3UDVF3IQ,17877
geopandas/testing.py,sha256=YvOVC_SiUhx8744Tik2vab5dbvqzp2yBSX7veZ75Go8,11182
geopandas/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
geopandas/tests/__pycache__/__init__.cpython-312.pyc,,
geopandas/tests/__pycache__/test_api.cpython-312.pyc,,
geopandas/tests/__pycache__/test_array.cpython-312.pyc,,
geopandas/tests/__pycache__/test_compat.cpython-312.pyc,,
geopandas/tests/__pycache__/test_config.cpython-312.pyc,,
geopandas/tests/__pycache__/test_crs.cpython-312.pyc,,
geopandas/tests/__pycache__/test_datasets.cpython-312.pyc,,
geopandas/tests/__pycache__/test_decorator.cpython-312.pyc,,
geopandas/tests/__pycache__/test_dissolve.cpython-312.pyc,,
geopandas/tests/__pycache__/test_explore.cpython-312.pyc,,
geopandas/tests/__pycache__/test_extension_array.cpython-312.pyc,,
geopandas/tests/__pycache__/test_geocode.cpython-312.pyc,,
geopandas/tests/__pycache__/test_geodataframe.cpython-312.pyc,,
geopandas/tests/__pycache__/test_geom_methods.cpython-312.pyc,,
geopandas/tests/__pycache__/test_geoseries.cpython-312.pyc,,
geopandas/tests/__pycache__/test_merge.cpython-312.pyc,,
geopandas/tests/__pycache__/test_op_output_types.cpython-312.pyc,,
geopandas/tests/__pycache__/test_overlay.cpython-312.pyc,,
geopandas/tests/__pycache__/test_pandas_methods.cpython-312.pyc,,
geopandas/tests/__pycache__/test_plotting.cpython-312.pyc,,
geopandas/tests/__pycache__/test_show_versions.cpython-312.pyc,,
geopandas/tests/__pycache__/test_sindex.cpython-312.pyc,,
geopandas/tests/__pycache__/test_testing.cpython-312.pyc,,
geopandas/tests/__pycache__/test_types.cpython-312.pyc,,
geopandas/tests/__pycache__/util.cpython-312.pyc,,
geopandas/tests/data/null_geom.geojson,sha256=Xof4mv2lVaGx1D2dwShkFp1tAUmk8iK-EtQKaoatgGY,506
geopandas/tests/test_api.py,sha256=xReLaZ5PA6RQXpiPDcFO2OnvUmmPfCSV1hnWLrOACGw,978
geopandas/tests/test_array.py,sha256=1NcA10jaDYg8xyUXErpRiH5EeWcIc1JQ7IwO0LSaRvI,29544
geopandas/tests/test_compat.py,sha256=YPEZtNUxJax4NZpJ5HRD9mr5PHrpPKA7K4HpVECQi0E,923
geopandas/tests/test_config.py,sha256=c19lPC70k1usUrpFdeRT2piNo3c8jJOY49pdQD67QcA,1204
geopandas/tests/test_crs.py,sha256=vIfG-m5AdbPZ6kVLC8jQiphPiwk5bK76GkcZbY1PFvg,26118
geopandas/tests/test_datasets.py,sha256=d3I2mU3KAmxelY5JgSPqgioQ2aUgsZqnr6dFQr5kKXA,455
geopandas/tests/test_decorator.py,sha256=nC7qRe2kgR1X7nFO_5olgw09BmbhUAIVAvy5phBo3yA,1474
geopandas/tests/test_dissolve.py,sha256=8zYTIFU81sYMmBmACw-vs2Xf_7hYp-rfjIE2eOfRmCY,12026
geopandas/tests/test_explore.py,sha256=w4VQb3hPRjQDUkZtuXIgbOc54TqJnjztwYKyTXoApA4,38832
geopandas/tests/test_extension_array.py,sha256=kxhz2RFlHJkD9o5uqveozvy-oilGJGNlJZ6gyFNK-uI,18635
geopandas/tests/test_geocode.py,sha256=sxmQpYSGJ5K0Uw6apYC4OfGZQeEBM_qq0HfPS9mciTU,5047
geopandas/tests/test_geodataframe.py,sha256=5PVQaVq7aZpC8zFeRKT_kzDBtpm5Zg7NeGh4rsWeVRU,59592
geopandas/tests/test_geom_methods.py,sha256=J00Ff4-9_rM6GKBJ-XITX1noDiQJ1Q-MceFKvO89zHc,85004
geopandas/tests/test_geoseries.py,sha256=4qxOTIi150R1sgYESQHe7IH5e4yflbqOKZoZ3kB-pTU,28439
geopandas/tests/test_merge.py,sha256=n7MyNnXtXWenZojR2jyGx7xf344emr4HPTvVswQ6Qvo,9901
geopandas/tests/test_op_output_types.py,sha256=Cb3SPQSf4wHS2LgaT4mEQUbF8oSTbWgxoP3Ri7EDZ4I,14569
geopandas/tests/test_overlay.py,sha256=ppO7Dog-s0aaTUic6qRJDiPBXIz3yUBOLCsQE3b81oI,29683
geopandas/tests/test_pandas_methods.py,sha256=NVDqYOGhQYrlWAH3KZt0gyDwd2DOf82Ni9SYjFr4F9k,28176
geopandas/tests/test_plotting.py,sha256=0WCYkbhuty8ofFPSCeUeSHyL2PIETPIkHnYmT8VtOmo,74479
geopandas/tests/test_show_versions.py,sha256=3rFbaoagNkP-MirR1Xu2Nq1RT7b5si-OpFKeDO6ptD0,1173
geopandas/tests/test_sindex.py,sha256=9GVAzcsFqgMquqi1Jy52-59rZc2v4c5kVcJR1Xcj0Qw,35022
geopandas/tests/test_testing.py,sha256=6dOYHkLRSr4t7_WVycFsQeS5DeU_voHN07Bevfi4mbs,5707
geopandas/tests/test_types.py,sha256=DHZW8bRbt4j2bjsWVOircZVWsKunVSv22UW4dlsZjaM,2504
geopandas/tests/util.py,sha256=CbFNLhUBTOTGwpvLvb3wVIxge7feaHflF3ISVqyuZSA,4595
geopandas/tools/__init__.py,sha256=y7lsFF8hsvU0fyeFzxVb8B1J3_m8j3ONBVdQugYFOM4,295
geopandas/tools/__pycache__/__init__.cpython-312.pyc,,
geopandas/tools/__pycache__/_random.cpython-312.pyc,,
geopandas/tools/__pycache__/_show_versions.cpython-312.pyc,,
geopandas/tools/__pycache__/clip.cpython-312.pyc,,
geopandas/tools/__pycache__/geocoding.cpython-312.pyc,,
geopandas/tools/__pycache__/hilbert_curve.cpython-312.pyc,,
geopandas/tools/__pycache__/overlay.cpython-312.pyc,,
geopandas/tools/__pycache__/sjoin.cpython-312.pyc,,
geopandas/tools/__pycache__/util.cpython-312.pyc,,
geopandas/tools/_random.py,sha256=2wlFAyOcwUDGg_qD0KvmLigJJZJRviXpvcMwOopbMq4,2533
geopandas/tools/_show_versions.py,sha256=4twOrsRAFhiqvYzVvmO3qpKfQw6Xv7ktq9_pv1Gd3Io,3813
geopandas/tools/clip.py,sha256=9xvPpNxmii5bACiHZsGrbfA4Xbh4ckfcKkYyOUbcpFA,9239
geopandas/tools/geocoding.py,sha256=62asH-d-xmzMQ20rsL1r7lu4_1NrRIcosu2w_Kow4LM,5647
geopandas/tools/hilbert_curve.py,sha256=Ec2mjT1L3uugvByqZg8n8eD8Vt2P-AlfjAtdzopvW7Y,4773
geopandas/tools/overlay.py,sha256=oo4iGPImfSofeKrU15NK_ak_y2rdiBv61FoZgc13asc,16295
geopandas/tools/sjoin.py,sha256=C8PxzWOIPLKOpsjC9ZPXq-aw-mmxtapSZLL2aDgrX6g,25800
geopandas/tools/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
geopandas/tools/tests/__pycache__/__init__.cpython-312.pyc,,
geopandas/tools/tests/__pycache__/test_clip.cpython-312.pyc,,
geopandas/tools/tests/__pycache__/test_hilbert_curve.cpython-312.pyc,,
geopandas/tools/tests/__pycache__/test_random.cpython-312.pyc,,
geopandas/tools/tests/__pycache__/test_sjoin.cpython-312.pyc,,
geopandas/tools/tests/__pycache__/test_tools.cpython-312.pyc,,
geopandas/tools/tests/test_clip.py,sha256=fMnYIRMv8G7TvQPxWX3MrJJirdEXt7bbo9I9t7SNDYc,18072
geopandas/tools/tests/test_hilbert_curve.py,sha256=JNlwQQjTB7GrzIi8yZtpKcur9O8scDI6pL2IQsSWGu0,2087
geopandas/tools/tests/test_random.py,sha256=8RdIpzn0FMYnhhPoIf1UKiq8et2QkoSzos5SiL1vEeA,1771
geopandas/tools/tests/test_sjoin.py,sha256=3JKZqi1uKjcbQ9jChqHK-imkgOXqBtd1fwL5pIadFuE,50934
geopandas/tools/tests/test_tools.py,sha256=zFmiAHBNKbSsdSmKkRFCueW_Oysz_Rca8Dvvd9wsKPg,1483
geopandas/tools/util.py,sha256=uz9uNGjocNJgaIGxZzfApz842hL7lFI3gr5e7lbC7ng,1465

View File

@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (70.2.0)
Root-Is-Purelib: true
Tag: py3-none-any

View File

@@ -0,0 +1 @@
geopandas

View File

@@ -0,0 +1,29 @@
from geopandas._config import options
from geopandas.geoseries import GeoSeries
from geopandas.geodataframe import GeoDataFrame
from geopandas.array import points_from_xy
from geopandas.io.file import _read_file as read_file
from geopandas.io.file import _list_layers as list_layers
from geopandas.io.arrow import _read_parquet as read_parquet
from geopandas.io.arrow import _read_feather as read_feather
from geopandas.io.sql import _read_postgis as read_postgis
from geopandas.tools import sjoin, sjoin_nearest
from geopandas.tools import overlay
from geopandas.tools._show_versions import show_versions
from geopandas.tools import clip
import geopandas.datasets
# make the interactive namespace easier to use
# for `from geopandas import *` demos.
import geopandas as gpd
import pandas as pd
import numpy as np
from . import _version
__version__ = _version.get_versions()["version"]

View File

@@ -0,0 +1,92 @@
import importlib
from packaging.version import Version
import pandas as pd
import shapely
import shapely.geos
# -----------------------------------------------------------------------------
# pandas compat
# -----------------------------------------------------------------------------
PANDAS_GE_14 = Version(pd.__version__) >= Version("1.4.0rc0")
PANDAS_GE_15 = Version(pd.__version__) >= Version("1.5.0")
PANDAS_GE_20 = Version(pd.__version__) >= Version("2.0.0")
PANDAS_GE_202 = Version(pd.__version__) >= Version("2.0.2")
PANDAS_GE_21 = Version(pd.__version__) >= Version("2.1.0")
PANDAS_GE_22 = Version(pd.__version__) >= Version("2.2.0")
PANDAS_GE_30 = Version(pd.__version__) >= Version("3.0.0.dev0")
# -----------------------------------------------------------------------------
# Shapely / GEOS compat
# -----------------------------------------------------------------------------
SHAPELY_GE_204 = Version(shapely.__version__) >= Version("2.0.4")
GEOS_GE_390 = shapely.geos.geos_version >= (3, 9, 0)
GEOS_GE_310 = shapely.geos.geos_version >= (3, 10, 0)
def import_optional_dependency(name: str, extra: str = ""):
"""
Import an optional dependency.
Adapted from pandas.compat._optional::import_optional_dependency
Raises a formatted ImportError if the module is not present.
Parameters
----------
name : str
The module name.
extra : str
Additional text to include in the ImportError message.
Returns
-------
module
"""
msg = """Missing optional dependency '{name}'. {extra} "
"Use pip or conda to install {name}.""".format(
name=name, extra=extra
)
if not isinstance(name, str):
raise ValueError(
"Invalid module name: '{name}'; must be a string".format(name=name)
)
try:
module = importlib.import_module(name)
except ImportError:
raise ImportError(msg) from None
return module
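# Editor's note: a hedged sketch of how import_optional_dependency is meant to be
# called (not part of upstream geopandas); 'mapclassify' is only an illustrative
# optional dependency here.
if __name__ == "__main__":
    try:
        mapclassify = import_optional_dependency(
            "mapclassify", extra="mapclassify is required for classification schemes."
        )
        print("found", mapclassify.__name__)
    except ImportError as exc:
        print(exc)  # the formatted "Missing optional dependency ..." message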
# -----------------------------------------------------------------------------
# pyproj compat
# -----------------------------------------------------------------------------
try:
import pyproj # noqa: F401
HAS_PYPROJ = True
except ImportError as err:
HAS_PYPROJ = False
pyproj_import_error = str(err)
def requires_pyproj(func):
def wrapper(*args, **kwargs):
if not HAS_PYPROJ:
raise ImportError(
f"The 'pyproj' package is required for {func.__name__} to work. "
"Install it and initialize the object with a CRS before using it."
f"\nImporting pyproj resulted in: {pyproj_import_error}"
)
return func(*args, **kwargs)
return wrapper

View File

@@ -0,0 +1,133 @@
"""
Lightweight options machinery.
Based on https://github.com/topper-123/optioneer, but simplified (don't deal
with nested options, deprecated options, ..), just the attribute-style dict
like holding the options and giving a nice repr.
"""
import textwrap
import warnings
from collections import namedtuple
Option = namedtuple("Option", "key default_value doc validator callback")
class Options(object):
"""Provide attribute-style access to configuration dict."""
def __init__(self, options):
super().__setattr__("_options", options)
# populate with default values
config = {}
for key, option in options.items():
config[key] = option.default_value
super().__setattr__("_config", config)
def __setattr__(self, key, value):
# you can't set new keys
if key in self._config:
option = self._options[key]
if option.validator:
option.validator(value)
self._config[key] = value
if option.callback:
option.callback(key, value)
else:
msg = "You can only set the value of existing options"
raise AttributeError(msg)
def __getattr__(self, key):
try:
return self._config[key]
except KeyError:
raise AttributeError("No such option")
def __dir__(self):
return list(self._config.keys())
def __repr__(self):
cls = self.__class__.__name__
description = ""
for key, option in self._options.items():
descr = "{key}: {cur!r} [default: {default!r}]\n".format(
key=key, cur=self._config[key], default=option.default_value
)
description += descr
if option.doc:
doc_text = "\n".join(textwrap.wrap(option.doc, width=70))
else:
doc_text = "No description available."
doc_text = textwrap.indent(doc_text, prefix=" ")
description += doc_text + "\n"
space = "\n "
description = description.replace("\n", space)
return "{}({}{})".format(cls, space, description)
def _validate_display_precision(value):
if value is not None:
if not isinstance(value, int) or not (0 <= value <= 16):
raise ValueError("Invalid value, needs to be an integer [0-16]")
display_precision = Option(
key="display_precision",
default_value=None,
doc=(
"The precision (maximum number of decimals) of the coordinates in "
"the WKT representation in the Series/DataFrame display. "
"By default (None), it tries to infer and use 3 decimals for projected "
"coordinates and 5 decimals for geographic coordinates."
),
validator=_validate_display_precision,
callback=None,
)
def _warn_use_pygeos_deprecated(_value):
warnings.warn(
"pygeos support was removed in 1.0. "
"geopandas.use_pygeos is a no-op and will be removed in geopandas 1.1.",
stacklevel=3,
)
def _validate_io_engine(value):
if value is not None:
if value not in ("pyogrio", "fiona"):
raise ValueError(f"Expected 'pyogrio' or 'fiona', got '{value}'")
io_engine = Option(
key="io_engine",
default_value=None,
doc=(
"The default engine for ``read_file`` and ``to_file``. "
"Options are 'pyogrio' and 'fiona'."
),
validator=_validate_io_engine,
callback=None,
)
# TODO: deprecate this
use_pygeos = Option(
key="use_pygeos",
default_value=False,
doc=(
"Deprecated option previously used to enable PyGEOS. "
"It will be removed in GeoPandas 1.1."
),
validator=_warn_use_pygeos_deprecated,
callback=None,
)
options = Options(
{
"display_precision": display_precision,
"use_pygeos": use_pygeos,
"io_engine": io_engine,
}
)
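# Editor's note: a hedged sketch of using the options object defined above (not
# part of upstream geopandas); the chosen precision value is illustrative.
if __name__ == "__main__":
    print(options.display_precision)  # None by default
    options.display_precision = 3     # validated: must be an integer in [0, 16]
    print(options)                    # repr lists each option, its current value and its doc
    try:
        options.unknown_option = True
    except AttributeError as exc:
        print(exc)                    # only existing options can be set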

View File

@@ -0,0 +1,52 @@
from textwrap import dedent
from typing import Callable, Union
# doc decorator function ported with modifications from Pandas
# https://github.com/pandas-dev/pandas/blob/master/pandas/util/_decorators.py
def doc(*docstrings: Union[str, Callable], **params) -> Callable:
"""
A decorator that takes docstring templates, concatenates them, and performs
string substitution on the result.
This decorator adds a variable "_docstring_components" to the wrapped
callable to keep track of the original docstring templates for potential future use.
If a component should be considered a template, it is saved as a string.
Otherwise, it is saved as a callable, and its __doc__ is later dedented
to obtain the docstring.
Parameters
----------
*docstrings : str or callable
The string / docstring / docstring template to be appended in order
after default docstring under callable.
**params
The string which would be used to format docstring template.
"""
def decorator(decorated: Callable) -> Callable:
# collecting docstring and docstring templates
docstring_components: list[Union[str, Callable]] = []
if decorated.__doc__:
docstring_components.append(dedent(decorated.__doc__))
for docstring in docstrings:
if hasattr(docstring, "_docstring_components"):
docstring_components.extend(docstring._docstring_components)
elif isinstance(docstring, str) or docstring.__doc__:
docstring_components.append(docstring)
# formatting templates and concatenating docstring
decorated.__doc__ = "".join(
(
component.format(**params)
if isinstance(component, str)
else dedent(component.__doc__ or "")
)
for component in docstring_components
)
decorated._docstring_components = docstring_components
return decorated
return decorator
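# Editor's note: a hedged example of composing docstrings with the @doc decorator
# above (not part of upstream geopandas); the function and template names are
# illustrative.
if __name__ == "__main__":
    _template = "Return the {what} of the given text."

    @doc(_template, what="length")
    def text_length(text):
        return len(text)

    @doc(text_length, what="upper-cased version")
    def text_upper(text):
        return text.upper()

    print(text_length.__doc__)  # "Return the length of the given text."
    print(text_upper.__doc__)   # same template, re-rendered with the new substitution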

View File

@@ -0,0 +1,21 @@
# This file was generated by 'versioneer.py' (0.29) from
# revision-control system data, or from the parent directory name of an
# unpacked source archive. Distribution tarballs contain a pre-generated copy
# of this file.
import json
version_json = '''
{
"date": "2024-07-02T14:23:16+0200",
"dirty": false,
"error": null,
"full-revisionid": "747d66ee6fcf00b819c08f11ecded53736c4652b",
"version": "1.0.1"
}
''' # END VERSION_JSON
def get_versions():
return json.loads(version_json)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,47 @@
import os.path
import geopandas
import pytest
from geopandas.tests.util import _NATURALEARTH_CITIES, _NATURALEARTH_LOWRES, _NYBB
@pytest.fixture(autouse=True)
def add_geopandas(doctest_namespace):
doctest_namespace["geopandas"] = geopandas
# Datasets used in our tests
@pytest.fixture(scope="session")
def naturalearth_lowres() -> str:
# skip if data missing, unless on github actions
if os.path.isfile(_NATURALEARTH_LOWRES) or os.getenv("GITHUB_ACTIONS"):
return _NATURALEARTH_LOWRES
else:
pytest.skip("Naturalearth lowres dataset not found")
@pytest.fixture(scope="session")
def naturalearth_cities() -> str:
# skip if data missing, unless on github actions
if os.path.isfile(_NATURALEARTH_CITIES) or os.getenv("GITHUB_ACTIONS"):
return _NATURALEARTH_CITIES
else:
pytest.skip("Naturalearth cities dataset not found")
@pytest.fixture(scope="session")
def nybb_filename() -> str:
# skip if data missing, unless on github actions
if os.path.isfile(_NYBB[len("zip://") :]) or os.getenv("GITHUB_ACTIONS"):
return _NYBB
else:
pytest.skip("NYBB dataset not found")
@pytest.fixture(scope="class")
def _setup_class_nybb_filename(nybb_filename, request):
"""Attach nybb_filename class attribute for unittest style setup_method"""
request.cls.nybb_filename = nybb_filename

View File

@@ -0,0 +1,25 @@
__all__ = []
available = [] # previously part of __all__
_prev_available = ["naturalearth_cities", "naturalearth_lowres", "nybb"]
def get_path(dataset):
ne_message = "https://www.naturalearthdata.com/downloads/110m-cultural-vectors/."
nybb_message = (
"the geodatasets package.\n\nfrom geodatasets import get_path\n"
"path_to_file = get_path('nybb')\n"
)
error_msg = (
"The geopandas.dataset has been deprecated and was removed in GeoPandas "
f"1.0. You can get the original '{dataset}' data from "
f"{ne_message if 'natural' in dataset else nybb_message}"
)
if dataset in _prev_available:
raise AttributeError(error_msg)
else:
error_msg = (
"The geopandas.dataset has been deprecated and "
"was removed in GeoPandas 1.0. New sample datasets are now available "
"in the geodatasets package (https://geodatasets.readthedocs.io/en/latest/)"
)
raise AttributeError(error_msg)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,614 @@
import json
from packaging.version import Version
from typing import Dict, Optional, Tuple
import numpy as np
import pandas as pd
import pyarrow as pa
from numpy.typing import NDArray
import shapely
from shapely import GeometryType
from geopandas import GeoDataFrame
from geopandas._compat import SHAPELY_GE_204
from geopandas.array import from_shapely, from_wkb
GEOARROW_ENCODINGS = [
"point",
"linestring",
"polygon",
"multipoint",
"multilinestring",
"multipolygon",
]
## GeoPandas -> GeoArrow
class ArrowTable:
"""
Wrapper class for Arrow data.
This class implements the `Arrow PyCapsule Protocol`_ (i.e. having an
``__arrow_c_stream__`` method). This object can then be consumed by
your Arrow implementation of choice that supports this protocol.
.. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
Example
-------
>>> import pyarrow as pa
>>> pa.table(gdf.to_arrow()) # doctest: +SKIP
"""
def __init__(self, pa_table):
self._pa_table = pa_table
def __arrow_c_stream__(self, requested_schema=None):
return self._pa_table.__arrow_c_stream__(requested_schema=requested_schema)
class GeoArrowArray:
"""
Wrapper class for a geometry array as Arrow data.
This class implements the `Arrow PyCapsule Protocol`_ (i.e. having an
``__arrow_c_array/stream__`` method). This object can then be consumed by
your Arrow implementation of choice that supports this protocol.
.. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
Example
-------
>>> import pyarrow as pa
>>> pa.array(ser.to_arrow()) # doctest: +SKIP
"""
def __init__(self, pa_field, pa_array):
self._pa_array = pa_array
self._pa_field = pa_field
def __arrow_c_array__(self, requested_schema=None):
if requested_schema is not None:
raise NotImplementedError(
"Requested schema is not supported for geometry arrays"
)
return (
self._pa_field.__arrow_c_schema__(),
self._pa_array.__arrow_c_array__()[1],
)
def geopandas_to_arrow(
df,
index=None,
geometry_encoding="WKB",
interleaved=True,
include_z=None,
):
"""
Convert GeoDataFrame to a pyarrow.Table.
Parameters
----------
df : GeoDataFrame
The GeoDataFrame to convert.
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output except `RangeIndex` which is stored as metadata only.
geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
The GeoArrow encoding to use for the data conversion.
interleaved : bool, default True
Only relevant for 'geoarrow' encoding. If True, the geometries'
coordinates are interleaved in a single fixed size list array.
If False, the coordinates are stored as separate arrays in a
struct type.
include_z : bool, default None
Only relevant for 'geoarrow' encoding (for WKB, the dimensionality
of the individual geometries is preserved).
If False, return 2D geometries. If True, include the third dimension
in the output (if a geometry has no third dimension, the z-coordinates
will be NaN). By default, will infer the dimensionality from the
input geometries. Note that this inference can be unreliable with
empty geometries (for a guaranteed result, it is recommended to
specify the keyword).
"""
mask = df.dtypes == "geometry"
geometry_columns = df.columns[mask]
geometry_indices = np.asarray(mask).nonzero()[0]
df_attr = pd.DataFrame(df.copy(deep=False))
# replace geometry columns with dummy values -> will get converted to
# Arrow null column (not holding any memory), so we can afterwards
# fill the resulting table with the correct geometry fields
for col in geometry_columns:
df_attr[col] = None
table = pa.Table.from_pandas(df_attr, preserve_index=index)
geometry_encoding_dict = {}
if geometry_encoding.lower() == "geoarrow":
if Version(pa.__version__) < Version("10.0.0"):
raise ValueError("Converting to 'geoarrow' requires pyarrow >= 10.0.")
# Encode all geometry columns to GeoArrow
for i, col in zip(geometry_indices, geometry_columns):
field, geom_arr = construct_geometry_array(
np.array(df[col].array),
include_z=include_z,
field_name=col,
crs=df[col].crs,
interleaved=interleaved,
)
table = table.set_column(i, field, geom_arr)
geometry_encoding_dict[col] = (
field.metadata[b"ARROW:extension:name"]
.decode()
.removeprefix("geoarrow.")
)
elif geometry_encoding.lower() == "wkb":
# Encode all geometry columns to WKB
for i, col in zip(geometry_indices, geometry_columns):
field, wkb_arr = construct_wkb_array(
np.asarray(df[col].array), field_name=col, crs=df[col].crs
)
table = table.set_column(i, field, wkb_arr)
geometry_encoding_dict[col] = "WKB"
else:
raise ValueError(
f"Expected geometry encoding 'WKB' or 'geoarrow' got {geometry_encoding}"
)
return table, geometry_encoding_dict
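# Editor's note: a hedged, minimal example of geopandas_to_arrow on a tiny
# GeoDataFrame (not part of upstream geopandas); the column names and CRS are
# illustrative, and pyproj is assumed to be installed.
if __name__ == "__main__":
    gdf = GeoDataFrame(
        {"name": ["a", "b"]},
        geometry=[shapely.Point(0, 0), shapely.Point(1, 1)],
        crs="EPSG:4326",
    )
    table, encodings = geopandas_to_arrow(gdf, geometry_encoding="WKB")
    print(encodings)           # {'geometry': 'WKB'}
    print(table.schema.names)  # attribute columns plus the encoded geometry column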
def construct_wkb_array(
shapely_arr: NDArray[np.object_],
*,
field_name: str = "geometry",
crs: Optional[str] = None,
) -> Tuple[pa.Field, pa.Array]:
if shapely.geos_version > (3, 10, 0):
kwargs = {"flavor": "iso"}
else:
if shapely.has_z(shapely_arr).any():
raise ValueError("Cannot write 3D geometries with GEOS<3.10")
kwargs = {}
wkb_arr = shapely.to_wkb(shapely_arr, **kwargs)
extension_metadata = {"ARROW:extension:name": "geoarrow.wkb"}
if crs is not None:
extension_metadata["ARROW:extension:metadata"] = json.dumps(
{"crs": crs.to_json()}
)
else:
# In theory this should not be needed, but otherwise pyarrow < 17
# crashes on receiving such data through C Data Interface
# https://github.com/apache/arrow/issues/41741
extension_metadata["ARROW:extension:metadata"] = "{}"
field = pa.field(
field_name, type=pa.binary(), nullable=True, metadata=extension_metadata
)
parr = pa.array(np.asarray(wkb_arr), pa.binary())
return field, parr
def _convert_inner_coords(coords, interleaved, dims, mask=None):
if interleaved:
coords_field = pa.field(dims, pa.float64(), nullable=False)
typ = pa.list_(coords_field, len(dims))
if mask is None:
# mask keyword only added in pyarrow 15.0.0
parr = pa.FixedSizeListArray.from_arrays(coords.ravel(), type=typ)
else:
parr = pa.FixedSizeListArray.from_arrays(
coords.ravel(), type=typ, mask=mask
)
else:
if dims == "xy":
fields = [
pa.field("x", pa.float64(), nullable=False),
pa.field("y", pa.float64(), nullable=False),
]
parr = pa.StructArray.from_arrays(
[coords[:, 0].copy(), coords[:, 1].copy()], fields=fields, mask=mask
)
else:
fields = [
pa.field("x", pa.float64(), nullable=False),
pa.field("y", pa.float64(), nullable=False),
pa.field("z", pa.float64(), nullable=False),
]
parr = pa.StructArray.from_arrays(
[coords[:, 0].copy(), coords[:, 1].copy(), coords[:, 2].copy()],
fields=fields,
mask=mask,
)
return parr
def _linestring_type(point_type):
return pa.list_(pa.field("vertices", point_type, nullable=False))
def _polygon_type(point_type):
return pa.list_(
pa.field(
"rings",
pa.list_(pa.field("vertices", point_type, nullable=False)),
nullable=False,
)
)
def _multipoint_type(point_type):
return pa.list_(pa.field("points", point_type, nullable=False))
def _multilinestring_type(point_type):
return pa.list_(
pa.field("linestrings", _linestring_type(point_type), nullable=False)
)
def _multipolygon_type(point_type):
return pa.list_(pa.field("polygons", _polygon_type(point_type), nullable=False))
def construct_geometry_array(
shapely_arr: NDArray[np.object_],
include_z: Optional[bool] = None,
*,
field_name: str = "geometry",
crs: Optional[str] = None,
interleaved: bool = True,
) -> Tuple[pa.Field, pa.Array]:
# NOTE: this implementation returns a (field, array) pair so that it can set the
# extension metadata on the field without instantiating extension types into the
# global pyarrow registry
geom_type, coords, offsets = shapely.to_ragged_array(
shapely_arr, include_z=include_z
)
mask = shapely.is_missing(shapely_arr)
if mask.any():
if (
geom_type == GeometryType.POINT
and interleaved
and Version(pa.__version__) < Version("15.0.0")
):
raise ValueError(
"Converting point geometries with missing values is not supported "
"for interleaved coordinates with pyarrow < 15.0.0. Please "
"upgrade to a newer version of pyarrow."
)
mask = pa.array(mask, type=pa.bool_())
if geom_type == GeometryType.POINT and not SHAPELY_GE_204:
# bug in shapely < 2.0.4, see https://github.com/shapely/shapely/pull/2034
# this workaround only works if there are no empty points
indices = np.nonzero(mask)[0]
indices = indices - np.arange(len(indices))
coords = np.insert(coords, indices, np.nan, axis=0)
else:
mask = None
if coords.shape[-1] == 2:
dims = "xy"
elif coords.shape[-1] == 3:
dims = "xyz"
else:
raise ValueError(f"Unexpected coords dimensions: {coords.shape}")
extension_metadata: Dict[str, str] = {}
if crs is not None:
extension_metadata["ARROW:extension:metadata"] = json.dumps(
{"crs": crs.to_json()}
)
else:
# In theory this should not be needed, but otherwise pyarrow < 17
# crashes on receiving such data through C Data Interface
# https://github.com/apache/arrow/issues/41741
extension_metadata["ARROW:extension:metadata"] = "{}"
if geom_type == GeometryType.POINT:
parr = _convert_inner_coords(coords, interleaved, dims, mask=mask)
extension_metadata["ARROW:extension:name"] = "geoarrow.point"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.LINESTRING:
assert len(offsets) == 1, "Expected one offsets array"
(geom_offsets,) = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
parr = pa.ListArray.from_arrays(
pa.array(geom_offsets), _parr, _linestring_type(_parr.type), mask=mask
)
extension_metadata["ARROW:extension:name"] = "geoarrow.linestring"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.POLYGON:
assert len(offsets) == 2, "Expected two offsets arrays"
ring_offsets, geom_offsets = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
parr = parr.cast(_polygon_type(_parr.type))
extension_metadata["ARROW:extension:name"] = "geoarrow.polygon"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.MULTIPOINT:
assert len(offsets) == 1, "Expected one offsets array"
(geom_offsets,) = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
parr = pa.ListArray.from_arrays(
pa.array(geom_offsets), _parr, type=_multipoint_type(_parr.type), mask=mask
)
extension_metadata["ARROW:extension:name"] = "geoarrow.multipoint"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.MULTILINESTRING:
assert len(offsets) == 2, "Expected two offsets arrays"
ring_offsets, geom_offsets = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr1, mask=mask)
parr = parr.cast(_multilinestring_type(_parr.type))
extension_metadata["ARROW:extension:name"] = "geoarrow.multilinestring"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
elif geom_type == GeometryType.MULTIPOLYGON:
assert len(offsets) == 3, "Expected three offsets arrays"
ring_offsets, polygon_offsets, geom_offsets = offsets
_parr = _convert_inner_coords(coords, interleaved, dims)
_parr1 = pa.ListArray.from_arrays(pa.array(ring_offsets), _parr)
_parr2 = pa.ListArray.from_arrays(pa.array(polygon_offsets), _parr1)
parr = pa.ListArray.from_arrays(pa.array(geom_offsets), _parr2, mask=mask)
parr = parr.cast(_multipolygon_type(_parr.type))
extension_metadata["ARROW:extension:name"] = "geoarrow.multipolygon"
field = pa.field(
field_name,
parr.type,
nullable=True,
metadata=extension_metadata,
)
return field, parr
else:
raise ValueError(f"Unsupported type for geoarrow: {geom_type}")
## GeoArrow -> GeoPandas
def _get_arrow_geometry_field(field):
if (meta := field.metadata) is not None:
if (ext_name := meta.get(b"ARROW:extension:name", None)) is not None:
if ext_name.startswith(b"geoarrow."):
if (
ext_meta := meta.get(b"ARROW:extension:metadata", None)
) is not None:
ext_meta = json.loads(ext_meta.decode())
return ext_name.decode(), ext_meta
if isinstance(field.type, pa.ExtensionType):
ext_name = field.type.extension_name
if ext_name.startswith("geoarrow."):
ext_meta_ser = field.type.__arrow_ext_serialize__()
if ext_meta_ser:
ext_meta = json.loads(ext_meta_ser.decode())
else:
ext_meta = None
return ext_name, ext_meta
return None
def arrow_to_geopandas(table, geometry=None):
"""
Convert Arrow table object to a GeoDataFrame based on GeoArrow extension types.
Parameters
----------
table : pyarrow.Table
The Arrow table to convert.
geometry : str, default None
The name of the geometry column to set as the active geometry
column. If None, the first geometry column found will be used.
Returns
-------
GeoDataFrame
"""
if not isinstance(table, pa.Table):
table = pa.table(table)
geom_fields = []
for i, field in enumerate(table.schema):
geom = _get_arrow_geometry_field(field)
if geom is not None:
geom_fields.append((i, field.name, *geom))
if len(geom_fields) == 0:
raise ValueError("No geometry column found in the Arrow table.")
table_attr = table.drop([f[1] for f in geom_fields])
df = table_attr.to_pandas()
for i, col, ext_name, ext_meta in geom_fields:
crs = None
if ext_meta is not None and "crs" in ext_meta:
crs = ext_meta["crs"]
if ext_name == "geoarrow.wkb":
geom_arr = from_wkb(np.array(table[col]), crs=crs)
elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
geom_arr = from_shapely(
construct_shapely_array(table[col].combine_chunks(), ext_name), crs=crs
)
else:
raise TypeError(f"Unknown GeoArrow extension type: {ext_name}")
df.insert(i, col, geom_arr)
return GeoDataFrame(df, geometry=geometry or geom_fields[0][1])
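# Editor's note: a hedged round-trip sketch using the two conversion helpers above
# (not part of upstream geopandas); it assumes pyarrow >= 10 and pyproj are
# available, and the data is illustrative.
if __name__ == "__main__":
    original = GeoDataFrame(
        {"id": [1, 2]},
        geometry=[shapely.Point(0, 0), shapely.Point(1, 1)],
        crs="EPSG:4326",
    )
    arrow_table, _ = geopandas_to_arrow(original, geometry_encoding="geoarrow")
    restored = arrow_to_geopandas(arrow_table)
    print(restored.geom_type.tolist())  # ['Point', 'Point']
    print(restored.crs)                 # the CRS survives the round trip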
def arrow_to_geometry_array(arr):
"""
Convert Arrow array object (representing single GeoArrow array) to a
geopandas GeometryArray.
Specifically for GeoSeries.from_arrow.
"""
if Version(pa.__version__) < Version("14.0.0"):
raise ValueError("Importing from Arrow requires pyarrow >= 14.0.")
schema_capsule, array_capsule = arr.__arrow_c_array__()
field = pa.Field._import_from_c_capsule(schema_capsule)
pa_arr = pa.Array._import_from_c_capsule(field.__arrow_c_schema__(), array_capsule)
geom_info = _get_arrow_geometry_field(field)
if geom_info is None:
raise ValueError("No GeoArrow geometry field found.")
ext_name, ext_meta = geom_info
crs = None
if ext_meta is not None and "crs" in ext_meta:
crs = ext_meta["crs"]
if ext_name == "geoarrow.wkb":
geom_arr = from_wkb(np.array(pa_arr), crs=crs)
elif ext_name.split(".")[1] in GEOARROW_ENCODINGS:
geom_arr = from_shapely(construct_shapely_array(pa_arr, ext_name), crs=crs)
else:
raise ValueError(f"Unknown GeoArrow extension type: {ext_name}")
return geom_arr
def _get_inner_coords(arr):
if pa.types.is_struct(arr.type):
if arr.type.num_fields == 2:
coords = np.column_stack(
[np.asarray(arr.field("x")), np.asarray(arr.field("y"))]
)
else:
coords = np.column_stack(
[
np.asarray(arr.field("x")),
np.asarray(arr.field("y")),
np.asarray(arr.field("z")),
]
)
return coords
else:
# fixed size list
return np.asarray(arr.values).reshape(len(arr), -1)
def construct_shapely_array(arr: pa.Array, extension_name: str):
"""
Construct a NumPy array of shapely geometries from a pyarrow.Array
with GeoArrow extension type.
"""
if isinstance(arr, pa.ExtensionArray):
arr = arr.storage
if extension_name == "geoarrow.point":
coords = _get_inner_coords(arr)
result = shapely.from_ragged_array(GeometryType.POINT, coords, None)
elif extension_name == "geoarrow.linestring":
coords = _get_inner_coords(arr.values)
offsets1 = np.asarray(arr.offsets)
offsets = (offsets1,)
result = shapely.from_ragged_array(GeometryType.LINESTRING, coords, offsets)
elif extension_name == "geoarrow.polygon":
coords = _get_inner_coords(arr.values.values)
offsets2 = np.asarray(arr.offsets)
offsets1 = np.asarray(arr.values.offsets)
offsets = (offsets1, offsets2)
result = shapely.from_ragged_array(GeometryType.POLYGON, coords, offsets)
elif extension_name == "geoarrow.multipoint":
coords = _get_inner_coords(arr.values)
offsets1 = np.asarray(arr.offsets)
offsets = (offsets1,)
result = shapely.from_ragged_array(GeometryType.MULTIPOINT, coords, offsets)
elif extension_name == "geoarrow.multilinestring":
coords = _get_inner_coords(arr.values.values)
offsets2 = np.asarray(arr.offsets)
offsets1 = np.asarray(arr.values.offsets)
offsets = (offsets1, offsets2)
result = shapely.from_ragged_array(
GeometryType.MULTILINESTRING, coords, offsets
)
elif extension_name == "geoarrow.multipolygon":
coords = _get_inner_coords(arr.values.values.values)
offsets3 = np.asarray(arr.offsets)
offsets2 = np.asarray(arr.values.offsets)
offsets1 = np.asarray(arr.values.values.offsets)
offsets = (offsets1, offsets2, offsets3)
result = shapely.from_ragged_array(GeometryType.MULTIPOLYGON, coords, offsets)
else:
raise ValueError(extension_name)
# apply validity mask
if arr.null_count:
mask = np.asarray(arr.is_null())
result = np.where(mask, None, result)
return result

View File

@@ -0,0 +1,72 @@
from packaging.version import Version
import pyarrow
_ERROR_MSG = """\
Disallowed deserialization of 'arrow.py_extension_type':
storage_type = {storage_type}
serialized = {serialized}
pickle disassembly:\n{pickle_disassembly}
Reading of untrusted Parquet or Feather files with a PyExtensionType column
allows arbitrary code execution.
If you trust this file, you can enable reading the extension type by one of:
- upgrading to pyarrow >= 14.0.1 and calling `pa.PyExtensionType.set_auto_load(True)`
- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running
`import pyarrow_hotfix; pyarrow_hotfix.uninstall()`
We strongly recommend updating your Parquet/Feather files to use extension types
derived from `pyarrow.ExtensionType` instead, and register this type explicitly.
See https://arrow.apache.org/docs/dev/python/extending_types.html#defining-extension-types-user-defined-types
for more details.
"""
def patch_pyarrow():
# starting from pyarrow 14.0.1, it has its own mechanism
if Version(pyarrow.__version__) >= Version("14.0.1"):
return
# if the user has pyarrow_hotfix (https://github.com/pitrou/pyarrow-hotfix)
# installed, use this instead (which also ensures it works if they had
# called `pyarrow_hotfix.uninstall()`)
try:
import pyarrow_hotfix # noqa: F401
except ImportError:
pass
else:
return
# if the hotfix is already installed and enabled
if getattr(pyarrow, "_hotfix_installed", False):
return
class ForbiddenExtensionType(pyarrow.ExtensionType):
def __arrow_ext_serialize__(self):
return b""
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
import io
import pickletools
out = io.StringIO()
pickletools.dis(serialized, out)
raise RuntimeError(
_ERROR_MSG.format(
storage_type=storage_type,
serialized=serialized,
pickle_disassembly=out.getvalue(),
)
)
pyarrow.unregister_extension_type("arrow.py_extension_type")
pyarrow.register_extension_type(
ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type")
)
pyarrow._hotfix_installed = True
patch_pyarrow()

View File

@@ -0,0 +1,913 @@
import json
import warnings
from packaging.version import Version
import numpy as np
from pandas import DataFrame, Series
import shapely
import geopandas
from geopandas import GeoDataFrame
from geopandas._compat import import_optional_dependency
from geopandas.array import from_shapely, from_wkb
from .file import _expand_user
METADATA_VERSION = "1.0.0"
SUPPORTED_VERSIONS = ["0.1.0", "0.4.0", "1.0.0-beta.1", "1.0.0", "1.1.0"]
GEOARROW_ENCODINGS = [
"point",
"linestring",
"polygon",
"multipoint",
"multilinestring",
"multipolygon",
]
SUPPORTED_ENCODINGS = ["WKB"] + GEOARROW_ENCODINGS
# reference: https://github.com/opengeospatial/geoparquet
# Metadata structure:
# {
# "geo": {
# "columns": {
# "<name>": {
# "encoding": "WKB"
# "geometry_types": <list of str: REQUIRED>
# "crs": "<PROJJSON or None: OPTIONAL>",
# "orientation": "<'counterclockwise' or None: OPTIONAL>"
# "edges": "planar"
# "bbox": <list of [xmin, ymin, xmax, ymax]: OPTIONAL>
# "epoch": <float: OPTIONAL>
# }
# },
# "primary_column": "<str: REQUIRED>",
# "version": "<METADATA_VERSION>",
#
# # Additional GeoPandas specific metadata (not in metadata spec)
# "creator": {
# "library": "geopandas",
# "version": "<geopandas.__version__>"
# }
# }
# }
def _is_fsspec_url(url):
return (
isinstance(url, str)
and "://" in url
and not url.startswith(("http://", "https://"))
)
def _remove_id_from_member_of_ensembles(json_dict):
"""
Older PROJ versions will not recognize IDs of datum ensemble members that
were added in more recent PROJ database versions.
Cf https://github.com/opengeospatial/geoparquet/discussions/110
and https://github.com/OSGeo/PROJ/pull/3221
Mimicking the patch to GDAL from https://github.com/OSGeo/gdal/pull/5872
"""
for key, value in json_dict.items():
if isinstance(value, dict):
_remove_id_from_member_of_ensembles(value)
elif key == "members" and isinstance(value, list):
for member in value:
member.pop("id", None)
# type ids 0 to 7
_geometry_type_names = [
"Point",
"LineString",
"LineString",
"Polygon",
"MultiPoint",
"MultiLineString",
"MultiPolygon",
"GeometryCollection",
]
_geometry_type_names += [geom_type + " Z" for geom_type in _geometry_type_names]
def _get_geometry_types(series):
"""
Get unique geometry types from a GeoSeries.
"""
arr_geometry_types = shapely.get_type_id(series.array._data)
# ensure to include "... Z" for 3D geometries
has_z = shapely.has_z(series.array._data)
arr_geometry_types[has_z] += 8
geometry_types = Series(arr_geometry_types).unique().tolist()
# drop missing values (shapely.get_type_id returns -1 for those)
if -1 in geometry_types:
geometry_types.remove(-1)
return sorted([_geometry_type_names[idx] for idx in geometry_types])
def _create_metadata(
df, schema_version=None, geometry_encoding=None, write_covering_bbox=False
):
"""Create and encode geo metadata dict.
Parameters
----------
df : GeoDataFrame
schema_version : {'0.1.0', '0.4.0', '1.0.0-beta.1', '1.0.0', None}
GeoParquet specification version; if not provided will default to
latest supported version.
write_covering_bbox : bool, default False
Writes the bounding box column for each row entry with column
name 'bbox'. Writing a bbox column can be computationally
expensive, hence its default setting is False.
Returns
-------
dict
"""
if schema_version is None:
if geometry_encoding and any(
encoding != "WKB" for encoding in geometry_encoding.values()
):
schema_version = "1.1.0"
else:
schema_version = METADATA_VERSION
if schema_version not in SUPPORTED_VERSIONS:
raise ValueError(
f"schema_version must be one of: {', '.join(SUPPORTED_VERSIONS)}"
)
# Construct metadata for each geometry
column_metadata = {}
for col in df.columns[df.dtypes == "geometry"]:
series = df[col]
geometry_types = _get_geometry_types(series)
if schema_version[0] == "0":
geometry_types_name = "geometry_type"
if len(geometry_types) == 1:
geometry_types = geometry_types[0]
else:
geometry_types_name = "geometry_types"
crs = None
if series.crs:
if schema_version == "0.1.0":
crs = series.crs.to_wkt()
else: # version >= 0.4.0
crs = series.crs.to_json_dict()
_remove_id_from_member_of_ensembles(crs)
column_metadata[col] = {
"encoding": geometry_encoding[col],
"crs": crs,
geometry_types_name: geometry_types,
}
bbox = series.total_bounds.tolist()
if np.isfinite(bbox).all():
# don't add bbox with NaNs for empty / all-NA geometry column
column_metadata[col]["bbox"] = bbox
if write_covering_bbox:
column_metadata[col]["covering"] = {
"bbox": {
"xmin": ["bbox", "xmin"],
"ymin": ["bbox", "ymin"],
"xmax": ["bbox", "xmax"],
"ymax": ["bbox", "ymax"],
},
}
return {
"primary_column": df._geometry_column_name,
"columns": column_metadata,
"version": schema_version,
"creator": {"library": "geopandas", "version": geopandas.__version__},
}
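# Editor's note: a hedged sketch of the "geo" metadata produced for a single
# WKB-encoded geometry column (not part of upstream geopandas); the GeoDataFrame
# below is illustrative.
if __name__ == "__main__":
    example = geopandas.GeoDataFrame(
        geometry=geopandas.points_from_xy([0.0], [0.0]), crs="EPSG:4326"
    )
    meta = _create_metadata(example, geometry_encoding={"geometry": "WKB"})
    print(meta["primary_column"], meta["version"])        # 'geometry' and the schema version
    print(meta["columns"]["geometry"]["encoding"])        # 'WKB'
    print(meta["columns"]["geometry"]["geometry_types"])  # ['Point']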
def _encode_metadata(metadata):
"""Encode metadata dict to UTF-8 JSON string
Parameters
----------
metadata : dict
Returns
-------
UTF-8 encoded JSON string
"""
return json.dumps(metadata).encode("utf-8")
def _decode_metadata(metadata_str):
"""Decode a UTF-8 encoded JSON string to dict
Parameters
----------
metadata_str : string (UTF-8 encoded)
Returns
-------
dict
"""
if metadata_str is None:
return None
return json.loads(metadata_str.decode("utf-8"))
def _validate_dataframe(df):
"""Validate that the GeoDataFrame conforms to requirements for writing
to Parquet format.
Raises `ValueError` if the GeoDataFrame is not valid.
copied from `pandas.io.parquet`
Parameters
----------
df : GeoDataFrame
"""
if not isinstance(df, DataFrame):
raise ValueError("Writing to Parquet/Feather only supports IO with DataFrames")
# must have value column names (strings only)
if df.columns.inferred_type not in {"string", "unicode", "empty"}:
raise ValueError("Writing to Parquet/Feather requires string column names")
# index level names must be strings
valid_names = all(
isinstance(name, str) for name in df.index.names if name is not None
)
if not valid_names:
raise ValueError("Index level names must be strings")
def _validate_geo_metadata(metadata):
"""Validate geo metadata.
Must not be empty, and must contain the structure specified above.
Raises ValueError if metadata is not valid.
Parameters
----------
metadata : dict
"""
if not metadata:
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
# version was schema_version in 0.1.0
version = metadata.get("version", metadata.get("schema_version"))
if not version:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key: "
"'version'"
)
required_keys = ("primary_column", "columns")
for key in required_keys:
if metadata.get(key, None) is None:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key: "
"'{key}'".format(key=key)
)
if not isinstance(metadata["columns"], dict):
raise ValueError("'columns' in 'geo' metadata must be a dict")
# Validate that geometry columns have required metadata and values
# leaving out "geometry_type" for compatibility with 0.1
required_col_keys = ("encoding",)
for col, column_metadata in metadata["columns"].items():
for key in required_col_keys:
if key not in column_metadata:
raise ValueError(
"'geo' metadata in Parquet/Feather file is missing required key "
"'{key}' for column '{col}'".format(key=key, col=col)
)
if column_metadata["encoding"] not in SUPPORTED_ENCODINGS:
raise ValueError(
"Only WKB geometry encoding or one of the native encodings "
f"({GEOARROW_ENCODINGS!r}) are supported, "
f"got: {column_metadata['encoding']}"
)
if column_metadata.get("edges", "planar") == "spherical":
warnings.warn(
f"The geo metadata indicate that column '{col}' has spherical edges, "
"but because GeoPandas currently does not support spherical "
"geometry, it ignores this metadata and will interpret the edges of "
"the geometries as planar.",
UserWarning,
stacklevel=4,
)
if "covering" in column_metadata:
covering = column_metadata["covering"]
if "bbox" in covering:
bbox = covering["bbox"]
for var in ["xmin", "ymin", "xmax", "ymax"]:
if var not in bbox.keys():
raise ValueError("Metadata for bbox column is malformed.")
def _geopandas_to_arrow(
df,
index=None,
geometry_encoding="WKB",
schema_version=None,
write_covering_bbox=None,
):
"""
Helper function with main, shared logic for to_parquet/to_feather.
"""
from pyarrow import StructArray
from geopandas.io._geoarrow import geopandas_to_arrow
_validate_dataframe(df)
if schema_version is not None:
if geometry_encoding != "WKB" and schema_version != "1.1.0":
raise ValueError(
"'geoarrow' encoding is only supported with schema version >= 1.1.0"
)
table, geometry_encoding_dict = geopandas_to_arrow(
df, geometry_encoding=geometry_encoding, index=index, interleaved=False
)
geo_metadata = _create_metadata(
df,
schema_version=schema_version,
geometry_encoding=geometry_encoding_dict,
write_covering_bbox=write_covering_bbox,
)
if write_covering_bbox:
if "bbox" in df.columns:
raise ValueError(
"An existing column 'bbox' already exists in the dataframe. "
"Please rename to write covering bbox."
)
bounds = df.bounds
bbox_array = StructArray.from_arrays(
[bounds["minx"], bounds["miny"], bounds["maxx"], bounds["maxy"]],
names=["xmin", "ymin", "xmax", "ymax"],
)
table = table.append_column("bbox", bbox_array)
# Store geopandas specific file-level metadata
# This must be done AFTER creating the table or it is not persisted
metadata = table.schema.metadata
metadata.update({b"geo": _encode_metadata(geo_metadata)})
return table.replace_schema_metadata(metadata)
def _to_parquet(
df,
path,
index=None,
compression="snappy",
geometry_encoding="WKB",
schema_version=None,
write_covering_bbox=False,
**kwargs,
):
"""
Write a GeoDataFrame to the Parquet format.
Any geometry columns present are serialized to WKB format in the file.
Requires 'pyarrow'.
This is tracking version 1.0.0 of the GeoParquet specification at:
https://github.com/opengeospatial/geoparquet. Writing older versions is
supported using the `schema_version` keyword.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output except `RangeIndex` which is stored as metadata only.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
geometry_encoding : {'WKB', 'geoarrow'}, default 'WKB'
The encoding to use for the geometry columns. Defaults to "WKB"
for maximum interoperability. Specify "geoarrow" to use one of the
native GeoArrow-based single-geometry type encodings.
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
GeoParquet specification version; if not provided will default to
latest supported version.
write_covering_bbox : bool, default False
Writes the bounding box column for each row entry with column
name 'bbox'. Writing a bbox column can be computationally
expensive, hence its default setting is False.
**kwargs
Additional keyword arguments passed to pyarrow.parquet.write_table().
"""
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
path = _expand_user(path)
table = _geopandas_to_arrow(
df,
index=index,
geometry_encoding=geometry_encoding,
schema_version=schema_version,
write_covering_bbox=write_covering_bbox,
)
parquet.write_table(table, path, compression=compression, **kwargs)
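# Usage sketch (illustrative, not part of the library): the public
# ``GeoDataFrame.to_parquet`` method delegates to ``_to_parquet``. The file
# names below are hypothetical.
#
#   >>> import geopandas
#   >>> gdf = geopandas.read_file("example.shp")  # doctest: +SKIP
#   >>> gdf.to_parquet("example.parquet", write_covering_bbox=True)  # doctest: +SKIP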
def _to_feather(df, path, index=None, compression=None, schema_version=None, **kwargs):
"""
Write a GeoDataFrame to the Feather format.
Any geometry columns present are serialized to WKB format in the file.
Requires 'pyarrow' >= 0.17.
This is tracking version 1.0.0 of the GeoParquet specification for
the metadata at: https://github.com/opengeospatial/geoparquet. Writing
older versions is supported using the `schema_version` keyword.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
index : bool, default None
If ``True``, always include the dataframe's index(es) as columns
in the file output.
If ``False``, the index(es) will not be written to the file.
If ``None``, the index(es) will be included as columns in the file
output, except a ``RangeIndex`` which is stored as metadata only.
compression : {'zstd', 'lz4', 'uncompressed'}, optional
Name of the compression to use. Use ``"uncompressed"`` for no
compression. By default uses LZ4 if available, otherwise uncompressed.
schema_version : {'0.1.0', '0.4.0', '1.0.0', None}
GeoParquet specification version for the metadata; if not provided
will default to latest supported version.
kwargs
Additional keyword arguments passed to pyarrow.feather.write_feather().
"""
feather = import_optional_dependency(
"pyarrow.feather", extra="pyarrow is required for Feather support."
)
# TODO move this into `import_optional_dependency`
import pyarrow
if Version(pyarrow.__version__) < Version("0.17.0"):
raise ImportError("pyarrow >= 0.17 required for Feather support")
path = _expand_user(path)
table = _geopandas_to_arrow(df, index=index, schema_version=schema_version)
feather.write_feather(table, path, compression=compression, **kwargs)
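# Usage sketch (illustrative): ``GeoDataFrame.to_feather`` delegates to
# ``_to_feather``; the file name is hypothetical.
#
#   >>> gdf.to_feather("example.feather", compression="zstd")  # doctest: +SKIP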
def _arrow_to_geopandas(table, geo_metadata=None):
"""
Helper function with main, shared logic for read_parquet/read_feather.
"""
if geo_metadata is None:
# Note: this path of not passing metadata is also used by dask-geopandas
geo_metadata = _validate_and_decode_metadata(table.schema.metadata)
# Find all geometry columns that were read from the file. May
# be a subset if 'columns' parameter is used.
geometry_columns = [
col for col in geo_metadata["columns"] if col in table.column_names
]
result_column_names = list(table.slice(0, 0).to_pandas().columns)
geometry_columns.sort(key=result_column_names.index)
if not len(geometry_columns):
raise ValueError(
"""No geometry columns are included in the columns read from
the Parquet/Feather file. To read this file without geometry columns,
use pandas.read_parquet/read_feather() instead."""
)
geometry = geo_metadata["primary_column"]
# Missing geometry likely indicates a subset of columns was read;
# promote the first available geometry to the primary geometry.
if len(geometry_columns) and geometry not in geometry_columns:
geometry = geometry_columns[0]
# if there are multiple non-primary geometry columns, raise a warning
if len(geometry_columns) > 1:
warnings.warn(
"Multiple non-primary geometry columns read from Parquet/Feather "
"file. The first column read was promoted to the primary geometry.",
stacklevel=3,
)
table_attr = table.drop(geometry_columns)
df = table_attr.to_pandas()
# Convert the WKB columns that are present back to geometry.
for col in geometry_columns:
col_metadata = geo_metadata["columns"][col]
if "crs" in col_metadata:
crs = col_metadata["crs"]
if isinstance(crs, dict):
_remove_id_from_member_of_ensembles(crs)
else:
# per the GeoParquet spec, missing CRS is to be interpreted as
# OGC:CRS84
crs = "OGC:CRS84"
if col_metadata["encoding"] == "WKB":
geom_arr = from_wkb(np.array(table[col]), crs=crs)
else:
from geopandas.io._geoarrow import construct_shapely_array
geom_arr = from_shapely(
construct_shapely_array(
table[col].combine_chunks(), "geoarrow." + col_metadata["encoding"]
),
crs=crs,
)
df.insert(result_column_names.index(col), col, geom_arr)
return GeoDataFrame(df, geometry=geometry)
def _get_filesystem_path(path, filesystem=None, storage_options=None):
"""
Get the filesystem and path for a given filesystem and path.
If the filesystem is not None then it's just returned as is.
"""
import pyarrow
if (
isinstance(path, str)
and storage_options is None
and filesystem is None
and Version(pyarrow.__version__) >= Version("5.0.0")
):
# Use the native pyarrow filesystem if possible.
try:
from pyarrow.fs import FileSystem
filesystem, path = FileSystem.from_uri(path)
except Exception:
# fallback to use get_handle / fsspec for filesystems
# that pyarrow doesn't support
pass
if _is_fsspec_url(path) and filesystem is None:
fsspec = import_optional_dependency(
"fsspec", extra="fsspec is requred for 'storage_options'."
)
filesystem, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
if filesystem is None and storage_options:
raise ValueError(
"Cannot provide 'storage_options' with non-fsspec path '{}'".format(path)
)
return filesystem, path
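# Illustrative sketch of how the filesystem resolution above is exercised:
# reading a hypothetical remote dataset with fsspec-style storage options
# (``{"anon": True}`` is s3fs's flag for anonymous access).
#
#   >>> geopandas.read_parquet(
#   ...     "s3://my-bucket/data.parquet",
#   ...     storage_options={"anon": True},
#   ... )  # doctest: +SKIP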
def _ensure_arrow_fs(filesystem):
"""
Simplified version of pyarrow.fs._ensure_filesystem. This is only needed
below because `pyarrow.parquet.read_metadata` does not yet accept a
filesystem keyword (https://issues.apache.org/jira/browse/ARROW-16719)
"""
from pyarrow import fs
if isinstance(filesystem, fs.FileSystem):
return filesystem
# handle fsspec-compatible filesystems
try:
import fsspec
except ImportError:
pass
else:
if isinstance(filesystem, fsspec.AbstractFileSystem):
return fs.PyFileSystem(fs.FSSpecHandler(filesystem))
return filesystem
def _validate_and_decode_metadata(metadata):
if metadata is None or b"geo" not in metadata:
raise ValueError(
"""Missing geo metadata in Parquet/Feather file.
Use pandas.read_parquet/read_feather() instead."""
)
# check for malformed metadata
try:
decoded_geo_metadata = _decode_metadata(metadata.get(b"geo", b""))
except (TypeError, json.decoder.JSONDecodeError):
raise ValueError("Missing or malformed geo metadata in Parquet/Feather file")
_validate_geo_metadata(decoded_geo_metadata)
return decoded_geo_metadata
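# For reference, the decoded "geo" metadata validated above is a JSON object
# of roughly this shape (per the GeoParquet specification; values shown here
# are illustrative only):
#
#   {
#       "version": "1.0.0",
#       "primary_column": "geometry",
#       "columns": {
#           "geometry": {"encoding": "WKB", "geometry_types": ["Polygon"]},
#       },
#   }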
def _read_parquet_schema_and_metadata(path, filesystem):
"""
Open the Parquet file/dataset a first time to get the schema and metadata.
TODO: we should look into how we can reuse opened dataset for reading the
actual data, to avoid discovering the dataset twice (problem right now is
that the ParquetDataset interface doesn't allow passing the filters on read)
"""
import pyarrow
from pyarrow import parquet
kwargs = {}
if Version(pyarrow.__version__) < Version("15.0.0"):
kwargs = dict(use_legacy_dataset=False)
try:
schema = parquet.ParquetDataset(path, filesystem=filesystem, **kwargs).schema
except Exception:
schema = parquet.read_schema(path, filesystem=filesystem)
metadata = schema.metadata
# read metadata separately to get the raw Parquet FileMetaData metadata
# (pyarrow doesn't properly expose those in schema.metadata for files
# created by GDAL - https://issues.apache.org/jira/browse/ARROW-16688)
if metadata is None or b"geo" not in metadata:
try:
metadata = parquet.read_metadata(path, filesystem=filesystem).metadata
except Exception:
pass
return schema, metadata
def _read_parquet(path, columns=None, storage_options=None, bbox=None, **kwargs):
"""
Load a Parquet object from the file path, returning a GeoDataFrame.
You can read a subset of columns in the file using the ``columns`` parameter.
However, the structure of the returned GeoDataFrame will depend on which
columns you read:
* if no geometry columns are read, this will raise a ``ValueError`` - you
should use the pandas `read_parquet` method instead.
* if the primary geometry column saved to this file is not included in
columns, the first available geometry column will be set as the geometry
column of the returned GeoDataFrame.
Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
specification at: https://github.com/opengeospatial/geoparquet
If the 'crs' key is not present in the GeoParquet metadata associated with the
Parquet object, it will default to "OGC:CRS84" according to the specification.
Requires 'pyarrow'.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
columns : list-like of strings, default=None
If not None, only these columns will be read from the file. If
the primary geometry column is not included, the first secondary
geometry read from the file will be set as the geometry column
of the returned GeoDataFrame. If no geometry columns are present,
a ``ValueError`` will be raised.
storage_options : dict, optional
Extra options that make sense for a particular storage connection, e.g. host,
port, username, password, etc. For HTTP(S) URLs the key-value pairs are
forwarded to urllib as header options. For other URLs (e.g. starting with
"s3://", and "gcs://") the key-value pairs are forwarded to fsspec. Please
see fsspec and urllib for more details.
When no storage options are provided and a filesystem is implemented by
both ``pyarrow.fs`` and ``fsspec`` (e.g. "s3://") then the ``pyarrow.fs``
filesystem is preferred. Provide the instantiated fsspec filesystem using
the ``filesystem`` keyword if you wish to use its implementation.
bbox : tuple, optional
Bounding box to be used to filter selection from geoparquet data. This
is only usable if the data was saved with the bbox covering metadata.
Input is of the tuple format (xmin, ymin, xmax, ymax).
**kwargs
Any additional kwargs passed to :func:`pyarrow.parquet.read_table`.
Returns
-------
GeoDataFrame
Examples
--------
>>> df = geopandas.read_parquet("data.parquet") # doctest: +SKIP
Specifying columns to read:
>>> df = geopandas.read_parquet(
... "data.parquet",
... columns=["geometry", "pop_est"]
... ) # doctest: +SKIP
"""
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
import geopandas.io._pyarrow_hotfix # noqa: F401
# TODO(https://github.com/pandas-dev/pandas/pull/41194): see if pandas
# adds filesystem as a keyword and match that.
filesystem = kwargs.pop("filesystem", None)
filesystem, path = _get_filesystem_path(
path, filesystem=filesystem, storage_options=storage_options
)
path = _expand_user(path)
schema, metadata = _read_parquet_schema_and_metadata(path, filesystem)
geo_metadata = _validate_and_decode_metadata(metadata)
bbox_filter = (
_get_parquet_bbox_filter(geo_metadata, bbox) if bbox is not None else None
)
if_bbox_column_exists = _check_if_covering_in_geo_metadata(geo_metadata)
# by default, the bbox column is not read in, so we must specify
# which columns to read when it exists.
if not columns and if_bbox_column_exists:
columns = _get_non_bbox_columns(schema, geo_metadata)
# if both bbox and filters kwargs are used, must splice together.
if "filters" in kwargs:
filters_kwarg = kwargs.pop("filters")
filters = _splice_bbox_and_filters(filters_kwarg, bbox_filter)
else:
filters = bbox_filter
kwargs["use_pandas_metadata"] = True
table = parquet.read_table(
path, columns=columns, filesystem=filesystem, filters=filters, **kwargs
)
return _arrow_to_geopandas(table, geo_metadata)
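# Usage sketch (illustrative): the public ``geopandas.read_parquet`` wraps the
# helper above. A ``bbox`` filter only works if the file was written with a
# bbox covering column or uses the GeoArrow 'point' encoding; the path is
# hypothetical.
#
#   >>> geopandas.read_parquet("example.parquet", bbox=(0, 0, 10, 10))  # doctest: +SKIP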
def _read_feather(path, columns=None, **kwargs):
"""
Load a Feather object from the file path, returning a GeoDataFrame.
You can read a subset of columns in the file using the ``columns`` parameter.
However, the structure of the returned GeoDataFrame will depend on which
columns you read:
* if no geometry columns are read, this will raise a ``ValueError`` - you
should use the pandas `read_feather` method instead.
* if the primary geometry column saved to this file is not included in
columns, the first available geometry column will be set as the geometry
column of the returned GeoDataFrame.
Supports versions 0.1.0, 0.4.0 and 1.0.0 of the GeoParquet
specification at: https://github.com/opengeospatial/geoparquet
If the 'crs' key is not present in the GeoParquet metadata associated with the
Feather object, it will default to "OGC:CRS84" according to the specification.
Requires 'pyarrow' >= 0.17.
.. versionadded:: 0.8
Parameters
----------
path : str, path object
columns : list-like of strings, default=None
If not None, only these columns will be read from the file. If
the primary geometry column is not included, the first secondary
geometry read from the file will be set as the geometry column
of the returned GeoDataFrame. If no geometry columns are present,
a ``ValueError`` will be raised.
**kwargs
Any additional kwargs passed to pyarrow.feather.read_table().
Returns
-------
GeoDataFrame
Examples
--------
>>> df = geopandas.read_feather("data.feather") # doctest: +SKIP
Specifying columns to read:
>>> df = geopandas.read_feather(
... "data.feather",
... columns=["geometry", "pop_est"]
... ) # doctest: +SKIP
"""
feather = import_optional_dependency(
"pyarrow.feather", extra="pyarrow is required for Feather support."
)
# TODO move this into `import_optional_dependency`
import pyarrow
import geopandas.io._pyarrow_hotfix # noqa: F401
if Version(pyarrow.__version__) < Version("0.17.0"):
raise ImportError("pyarrow >= 0.17 required for Feather support")
path = _expand_user(path)
table = feather.read_table(path, columns=columns, **kwargs)
return _arrow_to_geopandas(table)
def _get_parquet_bbox_filter(geo_metadata, bbox):
primary_column = geo_metadata["primary_column"]
if _check_if_covering_in_geo_metadata(geo_metadata):
bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
return _convert_bbox_to_parquet_filter(bbox, bbox_column_name)
elif geo_metadata["columns"][primary_column]["encoding"] == "point":
import pyarrow.compute as pc
return (
(pc.field((primary_column, "x")) >= bbox[0])
& (pc.field((primary_column, "x")) <= bbox[2])
& (pc.field((primary_column, "y")) >= bbox[1])
& (pc.field((primary_column, "y")) <= bbox[3])
)
else:
raise ValueError(
"Specifying 'bbox' not supported for this Parquet file (it should either "
"have a bbox covering column or use 'point' encoding)."
)
def _convert_bbox_to_parquet_filter(bbox, bbox_column_name):
import pyarrow.compute as pc
return ~(
(pc.field((bbox_column_name, "xmin")) > bbox[2])
| (pc.field((bbox_column_name, "ymin")) > bbox[3])
| (pc.field((bbox_column_name, "xmax")) < bbox[0])
| (pc.field((bbox_column_name, "ymax")) < bbox[1])
)
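# The expression above keeps a row unless its bbox lies entirely outside the
# query bbox: two boxes are disjoint exactly when one starts after the other
# ends along either axis, and negating that disjointness test gives the
# intersection test. Worked example with plain numbers (illustrative only):
#
#   query = (0, 0, 10, 10)   # xmin, ymin, xmax, ymax
#   row = (12, 0, 15, 5)     # starts after the query ends on x -> disjoint
#   keep = not (row[0] > query[2] or row[1] > query[3]
#               or row[2] < query[0] or row[3] < query[1])  # -> False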
def _check_if_covering_in_geo_metadata(geo_metadata):
primary_column = geo_metadata["primary_column"]
return "covering" in geo_metadata["columns"][primary_column].keys()
def _get_bbox_encoding_column_name(geo_metadata):
primary_column = geo_metadata["primary_column"]
return geo_metadata["columns"][primary_column]["covering"]["bbox"]["xmin"][0]
def _get_non_bbox_columns(schema, geo_metadata):
bbox_column_name = _get_bbox_encoding_column_name(geo_metadata)
columns = schema.names
if bbox_column_name in columns:
columns.remove(bbox_column_name)
return columns
def _splice_bbox_and_filters(kwarg_filters, bbox_filter):
parquet = import_optional_dependency(
"pyarrow.parquet", extra="pyarrow is required for Parquet support."
)
if bbox_filter is None:
return kwarg_filters
filters_expression = parquet.filters_to_expression(kwarg_filters)
return bbox_filter & filters_expression

View File

@@ -0,0 +1,851 @@
from __future__ import annotations
import os
import urllib.request
import warnings
from io import IOBase
from packaging.version import Version
from pathlib import Path
# Adapted from pandas.io.common
from urllib.parse import urlparse as parse_url
from urllib.parse import uses_netloc, uses_params, uses_relative
import numpy as np
import pandas as pd
from pandas.api.types import is_integer_dtype
import shapely
from shapely.geometry import mapping
from shapely.geometry.base import BaseGeometry
from geopandas import GeoDataFrame, GeoSeries
from geopandas._compat import HAS_PYPROJ, PANDAS_GE_20
from geopandas.io.util import vsi_path
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
_VALID_URLS.discard("")
# file:// URIs are supported by fiona/pyogrio -> don't already open + read the file here
_VALID_URLS.discard("file")
fiona = None
fiona_env = None
fiona_import_error = None
FIONA_GE_19 = False
def _import_fiona():
global fiona
global fiona_env
global fiona_import_error
global FIONA_GE_19
if fiona is None:
try:
import fiona
# only try to import fiona.Env if the main fiona import succeeded
# (otherwise you can get confusing "AttributeError: module 'fiona'
# has no attribute '_loading'" / partially initialized module errors)
try:
from fiona import Env as fiona_env
except ImportError:
try:
from fiona import drivers as fiona_env
except ImportError:
fiona_env = None
FIONA_GE_19 = Version(Version(fiona.__version__).base_version) >= Version(
"1.9.0"
)
except ImportError as err:
fiona = False
fiona_import_error = str(err)
pyogrio = None
pyogrio_import_error = None
def _import_pyogrio():
global pyogrio
global pyogrio_import_error
if pyogrio is None:
try:
import pyogrio
except ImportError as err:
pyogrio = False
pyogrio_import_error = str(err)
def _check_fiona(func):
if not fiona:
raise ImportError(
f"the {func} requires the 'fiona' package, but it is not installed or does "
f"not import correctly.\nImporting fiona resulted in: {fiona_import_error}"
)
def _check_pyogrio(func):
if not pyogrio:
raise ImportError(
f"the {func} requires the 'pyogrio' package, but it is not installed "
"or does not import correctly."
"\nImporting pyogrio resulted in: {pyogrio_import_error}"
)
def _check_metadata_supported(metadata: str | None, engine: str, driver: str) -> None:
if metadata is None:
return
if driver != "GPKG":
raise NotImplementedError(
"The 'metadata' keyword is only supported for the GPKG driver."
)
if engine == "fiona" and not FIONA_GE_19:
raise NotImplementedError(
"The 'metadata' keyword is only supported for Fiona >= 1.9."
)
def _check_engine(engine, func):
# if not specified through keyword or option, then default to "pyogrio" if
# installed, otherwise try fiona
if engine is None:
import geopandas
engine = geopandas.options.io_engine
if engine is None:
_import_pyogrio()
if pyogrio:
engine = "pyogrio"
else:
_import_fiona()
if fiona:
engine = "fiona"
if engine == "pyogrio":
_import_pyogrio()
_check_pyogrio(func)
elif engine == "fiona":
_import_fiona()
_check_fiona(func)
elif engine is None:
raise ImportError(
f"The {func} requires the 'pyogrio' or 'fiona' package, "
"but neither is installed or imports correctly."
f"\nImporting pyogrio resulted in: {pyogrio_import_error}"
f"\nImporting fiona resulted in: {fiona_import_error}"
)
return engine
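# Illustrative sketch: the engine resolved above can also be pinned globally
# rather than passed per call. The file name is hypothetical.
#
#   >>> import geopandas
#   >>> geopandas.options.io_engine = "pyogrio"  # doctest: +SKIP
#   >>> geopandas.read_file("example.shp")  # doctest: +SKIP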
_EXTENSION_TO_DRIVER = {
".bna": "BNA",
".dxf": "DXF",
".csv": "CSV",
".shp": "ESRI Shapefile",
".dbf": "ESRI Shapefile",
".json": "GeoJSON",
".geojson": "GeoJSON",
".geojsonl": "GeoJSONSeq",
".geojsons": "GeoJSONSeq",
".gpkg": "GPKG",
".gml": "GML",
".xml": "GML",
".gpx": "GPX",
".gtm": "GPSTrackMaker",
".gtz": "GPSTrackMaker",
".tab": "MapInfo File",
".mif": "MapInfo File",
".mid": "MapInfo File",
".dgn": "DGN",
".fgb": "FlatGeobuf",
}
def _expand_user(path):
"""Expand paths that use ~."""
if isinstance(path, str):
path = os.path.expanduser(path)
elif isinstance(path, Path):
path = path.expanduser()
return path
def _is_url(url):
"""Check to see if *url* has a valid protocol."""
try:
return parse_url(url).scheme in _VALID_URLS
except Exception:
return False
def _read_file(
filename, bbox=None, mask=None, columns=None, rows=None, engine=None, **kwargs
):
"""
Returns a GeoDataFrame from a file or URL.
Parameters
----------
filename : str, path object or file-like object
Either the absolute or relative path to the file or URL to
be opened, or any object with a read() method (such as an open file
or StringIO)
bbox : tuple | GeoDataFrame or GeoSeries | shapely Geometry, default None
Filter features by given bounding box, GeoSeries, GeoDataFrame or a shapely
geometry. With engine="fiona", CRS mis-matches are resolved if given a GeoSeries
or GeoDataFrame. With engine="pyogrio", bbox must be in the same CRS as the
dataset. Tuple is (minx, miny, maxx, maxy) to match the bounds property of
shapely geometry objects. Cannot be used with mask.
mask : dict | GeoDataFrame or GeoSeries | shapely Geometry, default None
Filter for features that intersect with the given dict-like geojson
geometry, GeoSeries, GeoDataFrame or shapely geometry.
CRS mis-matches are resolved if given a GeoSeries or GeoDataFrame.
Cannot be used with bbox. If multiple geometries are passed, this will
first union all geometries, which may be computationally expensive.
columns : list, optional
List of column names to import from the data source. Column names
must exactly match the names in the data source. To avoid reading
any columns (besides the geometry column), pass an empty list-like.
By default reads all columns.
rows : int or slice, default None
Load in specific rows by passing an integer (first `n` rows) or a
slice() object.
engine : str, "pyogrio" or "fiona"
The underlying library that is used to read the file. Currently, the
supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
installed, otherwise tries "fiona". Engine can also be set globally
with the ``geopandas.options.io_engine`` option.
**kwargs :
Keyword args to be passed to the engine, and can be used to read from
multi-layer data, data stored within archives (zip files), etc.
In case of the "pyogrio" engine, the keyword arguments are passed to
``pyogrio.read_dataframe``. In case of the "fiona" engine, the keyword
arguments are passed to ``fiona.open``. For more information on possible
keywords, type: ``import pyogrio; help(pyogrio.read_dataframe)``.
Examples
--------
>>> df = geopandas.read_file("nybb.shp") # doctest: +SKIP
Specifying layer of GPKG:
>>> df = geopandas.read_file("file.gpkg", layer='cities') # doctest: +SKIP
Reading only first 10 rows:
>>> df = geopandas.read_file("nybb.shp", rows=10) # doctest: +SKIP
Reading only geometries intersecting ``mask``:
>>> df = geopandas.read_file("nybb.shp", mask=polygon) # doctest: +SKIP
Reading only geometries intersecting ``bbox``:
>>> df = geopandas.read_file("nybb.shp", bbox=(0, 0, 10, 20)) # doctest: +SKIP
Returns
-------
:obj:`geopandas.GeoDataFrame` or :obj:`pandas.DataFrame` :
If `ignore_geometry=True` a :obj:`pandas.DataFrame` will be returned.
Notes
-----
The format drivers will attempt to detect the encoding of your data, but
may fail. In this case, the proper encoding can be specified explicitly
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
When specifying a URL, geopandas will check if the server supports reading
partial data and in that case pass the URL as is to the underlying engine,
which will then use the network file system handler of GDAL to read from
the URL. Otherwise geopandas will download the data from the URL and pass
all data in-memory to the underlying engine.
If you need more control over how the URL is read, you can specify the
GDAL virtual filesystem manually (e.g. ``/vsicurl/https://...``). See the
GDAL documentation on filesystems for more details
(https://gdal.org/user/virtual_file_systems.html#vsicurl-http-https-ftp-files-random-access).
"""
engine = _check_engine(engine, "'read_file' function")
filename = _expand_user(filename)
from_bytes = False
if _is_url(filename):
# if it is a url that supports random access -> pass through to
# pyogrio/fiona as is (to support downloading only part of the file)
# otherwise still download manually because pyogrio/fiona don't support
# all types of urls (https://github.com/geopandas/geopandas/issues/2908)
with urllib.request.urlopen(filename) as response:
if not response.headers.get("Accept-Ranges") == "bytes":
filename = response.read()
from_bytes = True
if engine == "pyogrio":
return _read_file_pyogrio(
filename, bbox=bbox, mask=mask, columns=columns, rows=rows, **kwargs
)
elif engine == "fiona":
if pd.api.types.is_file_like(filename):
data = filename.read()
path_or_bytes = data.encode("utf-8") if isinstance(data, str) else data
from_bytes = True
else:
path_or_bytes = filename
return _read_file_fiona(
path_or_bytes,
from_bytes,
bbox=bbox,
mask=mask,
columns=columns,
rows=rows,
**kwargs,
)
else:
raise ValueError(f"unknown engine '{engine}'")
def _read_file_fiona(
path_or_bytes,
from_bytes,
bbox=None,
mask=None,
columns=None,
rows=None,
where=None,
**kwargs,
):
if where is not None and not FIONA_GE_19:
raise NotImplementedError("where requires fiona 1.9+")
if columns is not None:
if "include_fields" in kwargs:
raise ValueError(
"Cannot specify both 'include_fields' and 'columns' keywords"
)
if not FIONA_GE_19:
raise NotImplementedError("'columns' keyword requires fiona 1.9+")
kwargs["include_fields"] = columns
elif "include_fields" in kwargs:
# alias to columns, as this variable is used below to specify column order
# in the dataframe creation
columns = kwargs["include_fields"]
if not from_bytes:
# Opening a file via URL or file-like-object above automatically detects a
# zipped file. In order to match that behavior, attempt to add a zip scheme
# if missing.
path_or_bytes = vsi_path(str(path_or_bytes))
if from_bytes:
reader = fiona.BytesCollection
else:
reader = fiona.open
with fiona_env():
with reader(path_or_bytes, **kwargs) as features:
crs = features.crs_wkt
# attempt to get EPSG code
try:
# fiona 1.9+
epsg = features.crs.to_epsg(confidence_threshold=100)
if epsg is not None:
crs = epsg
except AttributeError:
# fiona <= 1.8
try:
crs = features.crs["init"]
except (TypeError, KeyError):
pass
# handle loading the bounding box
if bbox is not None:
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
bbox = tuple(bbox.to_crs(crs).total_bounds)
elif isinstance(bbox, BaseGeometry):
bbox = bbox.bounds
assert len(bbox) == 4
# handle loading the mask
elif isinstance(mask, (GeoDataFrame, GeoSeries)):
mask = mapping(mask.to_crs(crs).union_all())
elif isinstance(mask, BaseGeometry):
mask = mapping(mask)
filters = {}
if bbox is not None:
filters["bbox"] = bbox
if mask is not None:
filters["mask"] = mask
if where is not None:
filters["where"] = where
# setup the data loading filter
if rows is not None:
if isinstance(rows, int):
rows = slice(rows)
elif not isinstance(rows, slice):
raise TypeError("'rows' must be an integer or a slice.")
f_filt = features.filter(rows.start, rows.stop, rows.step, **filters)
elif filters:
f_filt = features.filter(**filters)
else:
f_filt = features
# get list of columns
columns = columns or list(features.schema["properties"])
datetime_fields = [
k for (k, v) in features.schema["properties"].items() if v == "datetime"
]
if (
kwargs.get("ignore_geometry", False)
or features.schema["geometry"] == "None"
):
df = pd.DataFrame(
[record["properties"] for record in f_filt], columns=columns
)
else:
df = GeoDataFrame.from_features(
f_filt, crs=crs, columns=columns + ["geometry"]
)
for k in datetime_fields:
as_dt = None
# plain try catch for when pandas will raise in the future
# TODO we can tighten the exception type in future when it does
try:
with warnings.catch_warnings():
# pandas 2.x does not yet enforce this behaviour but raises a
# warning -> we want to suppress this warning for our users,
# and do this by turning it into an error so we take the
# `except` code path to try again with utc=True
warnings.filterwarnings(
"error",
"In a future version of pandas, parsing datetimes with "
"mixed time zones will raise an error",
FutureWarning,
)
as_dt = pd.to_datetime(df[k])
except Exception:
pass
if as_dt is None or as_dt.dtype == "object":
# if to_datetime failed, try again for mixed timezone offsets
# This can still fail if there are invalid datetimes
try:
as_dt = pd.to_datetime(df[k], utc=True)
except Exception:
pass
# if to_datetime succeeded, round datetimes as
# fiona only supports up to ms precision (any microseconds are
# floating point rounding error)
if as_dt is not None and not (as_dt.dtype == "object"):
if PANDAS_GE_20:
df[k] = as_dt.dt.as_unit("ms")
else:
df[k] = as_dt.dt.round(freq="ms")
return df
def _read_file_pyogrio(path_or_bytes, bbox=None, mask=None, rows=None, **kwargs):
import pyogrio
if rows is not None:
if isinstance(rows, int):
kwargs["max_features"] = rows
elif isinstance(rows, slice):
if rows.start is not None:
if rows.start < 0:
raise ValueError(
"Negative slice start not supported with the 'pyogrio' engine."
)
kwargs["skip_features"] = rows.start
if rows.stop is not None:
kwargs["max_features"] = rows.stop - (rows.start or 0)
if rows.step is not None:
raise ValueError("slice with step is not supported")
else:
raise TypeError("'rows' must be an integer or a slice.")
if bbox is not None and mask is not None:
# match error message from Fiona
raise ValueError("mask and bbox can not be set together")
if bbox is not None:
if isinstance(bbox, (GeoDataFrame, GeoSeries)):
crs = pyogrio.read_info(path_or_bytes).get("crs")
if isinstance(path_or_bytes, IOBase):
path_or_bytes.seek(0)
bbox = tuple(bbox.to_crs(crs).total_bounds)
elif isinstance(bbox, BaseGeometry):
bbox = bbox.bounds
if len(bbox) != 4:
raise ValueError("'bbox' should be a length-4 tuple.")
if mask is not None:
# NOTE: mask cannot be used at same time as bbox keyword
if isinstance(mask, (GeoDataFrame, GeoSeries)):
crs = pyogrio.read_info(path_or_bytes).get("crs")
if isinstance(path_or_bytes, IOBase):
path_or_bytes.seek(0)
mask = shapely.unary_union(mask.to_crs(crs).geometry.values)
elif isinstance(mask, BaseGeometry):
mask = shapely.unary_union(mask)
elif isinstance(mask, dict) or hasattr(mask, "__geo_interface__"):
# convert GeoJSON to shapely geometry
mask = shapely.geometry.shape(mask)
kwargs["mask"] = mask
if kwargs.pop("ignore_geometry", False):
kwargs["read_geometry"] = False
# translate `ignore_fields`/`include_fields` keyword for back compat with fiona
if "ignore_fields" in kwargs and "include_fields" in kwargs:
raise ValueError("Cannot specify both 'ignore_fields' and 'include_fields'")
elif "ignore_fields" in kwargs:
if kwargs.get("columns", None) is not None:
raise ValueError(
"Cannot specify both 'columns' and 'ignore_fields' keywords"
)
warnings.warn(
"The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
"will be removed in a future release. You can use the 'columns' keyword "
"instead to select which columns to read.",
DeprecationWarning,
stacklevel=3,
)
ignore_fields = kwargs.pop("ignore_fields")
fields = pyogrio.read_info(path_or_bytes)["fields"]
include_fields = [col for col in fields if col not in ignore_fields]
kwargs["columns"] = include_fields
elif "include_fields" in kwargs:
# translate `include_fields` keyword for back compat with fiona engine
if kwargs.get("columns", None) is not None:
raise ValueError(
"Cannot specify both 'columns' and 'include_fields' keywords"
)
warnings.warn(
"The 'include_fields' and 'ignore_fields' keywords are deprecated, and "
"will be removed in a future release. You can use the 'columns' keyword "
"instead to select which columns to read.",
DeprecationWarning,
stacklevel=3,
)
kwargs["columns"] = kwargs.pop("include_fields")
return pyogrio.read_dataframe(path_or_bytes, bbox=bbox, **kwargs)
def _detect_driver(path):
"""
Attempt to auto-detect driver based on the extension
"""
try:
# in case the path is a file handle
path = path.name
except AttributeError:
pass
try:
return _EXTENSION_TO_DRIVER[Path(path).suffix.lower()]
except KeyError:
# Assume it is a shapefile folder for now. In the future,
# will likely raise an exception when the expected
# folder writing behavior is more clearly defined.
return "ESRI Shapefile"
def _to_file(
df,
filename,
driver=None,
schema=None,
index=None,
mode="w",
crs=None,
engine=None,
metadata=None,
**kwargs,
):
"""
Write this GeoDataFrame to an OGR data source
A dictionary of supported OGR providers is available via:
>>> import pyogrio
>>> pyogrio.list_drivers() # doctest: +SKIP
Parameters
----------
df : GeoDataFrame to be written
filename : string
File path or file handle to write to. The path may specify a
GDAL VSI scheme.
driver : string, default None
The OGR format driver used to write the vector file.
If not specified, it attempts to infer it from the file extension.
If no extension is specified, it saves ESRI Shapefile to a folder.
schema : dict, default None
If specified, the schema dictionary is passed to Fiona to
better control how the file is written. If None, GeoPandas
will determine the schema based on each column's dtype.
Not supported for the "pyogrio" engine.
index : bool, default None
If True, write index into one or more columns (for MultiIndex).
Default None writes the index into one or more columns only if
the index is named, is a MultiIndex, or has a non-integer data
type. If False, no index is written.
.. versionadded:: 0.7
Previously the index was not written.
mode : string, default 'w'
The write mode, 'w' to overwrite the existing file and 'a' to append;
when using the pyogrio engine, you can also pass ``append=True``.
Not all drivers support appending. For the fiona engine, the drivers
that support appending are listed in fiona.supported_drivers or
https://github.com/Toblerity/Fiona/blob/master/fiona/drvsupport.py.
For the pyogrio engine, you should be able to use any driver that
is available in your installation of GDAL that supports append
capability; see the specific driver entry at
https://gdal.org/drivers/vector/index.html for more information.
crs : pyproj.CRS, default None
If specified, the CRS is passed to Fiona to
better control how the file is written. If None, GeoPandas
will determine the crs based on crs df attribute.
The value can be anything accepted
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
engine : str, "pyogrio" or "fiona"
The underlying library that is used to read the file. Currently, the
supported options are "pyogrio" and "fiona". Defaults to "pyogrio" if
installed, otherwise tries "fiona". Engine can also be set globally
with the ``geopandas.options.io_engine`` option.
metadata : dict[str, str], default None
Optional metadata to be stored in the file. Keys and values must be
strings. Only supported for the "GPKG" driver
(requires Fiona >= 1.9 or pyogrio >= 0.6).
**kwargs :
Keyword args to be passed to the engine, and can be used to write
to multi-layer data, store data within archives (zip files), etc.
In case of the "fiona" engine, the keyword arguments are passed to
``fiona.open``. For more information on possible keywords, type:
``import fiona; help(fiona.open)``. In case of the "pyogrio" engine,
the keyword arguments are passed to `pyogrio.write_dataframe`.
Notes
-----
The format drivers will attempt to detect the encoding of your data, but
may fail. In this case, the proper encoding can be specified explicitly
by using the encoding keyword parameter, e.g. ``encoding='utf-8'``.
"""
engine = _check_engine(engine, "'to_file' method")
filename = _expand_user(filename)
if index is None:
# Determine if index attribute(s) should be saved to file
# (only if they are named or are non-integer)
index = list(df.index.names) != [None] or not is_integer_dtype(df.index.dtype)
if index:
df = df.reset_index(drop=False)
if driver is None:
driver = _detect_driver(filename)
if driver == "ESRI Shapefile" and any(len(c) > 10 for c in df.columns.tolist()):
warnings.warn(
"Column names longer than 10 characters will be truncated when saved to "
"ESRI Shapefile.",
stacklevel=3,
)
if (df.dtypes == "geometry").sum() > 1:
raise ValueError(
"GeoDataFrame contains multiple geometry columns but GeoDataFrame.to_file "
"supports only a single geometry column. Use a GeoDataFrame.to_parquet or "
"GeoDataFrame.to_feather, drop additional geometry columns or convert them "
"to a supported format like a well-known text (WKT) using "
"`GeoSeries.to_wkt()`.",
)
_check_metadata_supported(metadata, engine, driver)
if mode not in ("w", "a"):
raise ValueError(f"'mode' should be one of 'w' or 'a', got '{mode}' instead")
if engine == "pyogrio":
_to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs)
elif engine == "fiona":
_to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs)
else:
raise ValueError(f"unknown engine '{engine}'")
def _to_file_fiona(df, filename, driver, schema, crs, mode, metadata, **kwargs):
if not HAS_PYPROJ and crs:
raise ImportError(
"The 'pyproj' package is required to write a file with a CRS, but it is not"
" installed or does not import correctly."
)
if schema is None:
schema = infer_schema(df)
if crs:
from pyproj import CRS
crs = CRS.from_user_input(crs)
else:
crs = df.crs
with fiona_env():
crs_wkt = None
try:
gdal_version = Version(
fiona.env.get_gdal_release_name().strip("e")
) # GH3147
except (AttributeError, ValueError):
gdal_version = Version("2.0.0") # just assume it is not the latest
if gdal_version >= Version("3.0.0") and crs:
crs_wkt = crs.to_wkt()
elif crs:
crs_wkt = crs.to_wkt("WKT1_GDAL")
with fiona.open(
filename, mode=mode, driver=driver, crs_wkt=crs_wkt, schema=schema, **kwargs
) as colxn:
if metadata is not None:
colxn.update_tags(metadata)
colxn.writerecords(df.iterfeatures())
def _to_file_pyogrio(df, filename, driver, schema, crs, mode, metadata, **kwargs):
import pyogrio
if schema is not None:
raise ValueError(
"The 'schema' argument is not supported with the 'pyogrio' engine."
)
if mode == "a":
kwargs["append"] = True
if crs is not None:
raise ValueError("Passing 'crs' is not supported with the 'pyogrio' engine.")
# for the fiona engine, this check is done in gdf.iterfeatures()
if not df.columns.is_unique:
raise ValueError("GeoDataFrame cannot contain duplicated column names.")
pyogrio.write_dataframe(df, filename, driver=driver, metadata=metadata, **kwargs)
def infer_schema(df):
from collections import OrderedDict
# TODO: test pandas string type and boolean type once released
types = {
"Int32": "int32",
"int32": "int32",
"Int64": "int",
"string": "str",
"boolean": "bool",
}
def convert_type(column, in_type):
if in_type == object:
return "str"
if in_type.name.startswith("datetime64"):
# numpy datetime type regardless of frequency
return "datetime"
if str(in_type) in types:
out_type = types[str(in_type)]
else:
out_type = type(np.zeros(1, in_type).item()).__name__
if out_type == "long":
out_type = "int"
return out_type
properties = OrderedDict(
[
(col, convert_type(col, _type))
for col, _type in zip(df.columns, df.dtypes)
if col != df._geometry_column_name
]
)
if df.empty:
warnings.warn(
"You are attempting to write an empty DataFrame to file. "
"For some drivers, this operation may fail.",
UserWarning,
stacklevel=3,
)
# Since https://github.com/Toblerity/Fiona/issues/446 resolution,
# Fiona allows a list of geometry types
geom_types = _geometry_types(df)
schema = {"geometry": geom_types, "properties": properties}
return schema
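# For reference, the Fiona schema produced above has roughly this shape
# (illustrative values for a hypothetical GeoDataFrame with a string column
# and an integer column):
#
#   {
#       "geometry": "Point",
#       "properties": OrderedDict([("name", "str"), ("pop_est", "int")]),
#   }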
def _geometry_types(df):
"""
Determine the geometry types in the GeoDataFrame for the schema.
"""
geom_types_2D = df[~df.geometry.has_z].geometry.geom_type.unique()
geom_types_2D = [gtype for gtype in geom_types_2D if gtype is not None]
geom_types_3D = df[df.geometry.has_z].geometry.geom_type.unique()
geom_types_3D = ["3D " + gtype for gtype in geom_types_3D if gtype is not None]
geom_types = geom_types_3D + geom_types_2D
if len(geom_types) == 0:
# Default geometry type supported by Fiona
# (Since https://github.com/Toblerity/Fiona/issues/446 resolution)
return "Unknown"
if len(geom_types) == 1:
geom_types = geom_types[0]
return geom_types
def _list_layers(filename) -> pd.DataFrame:
"""List layers available in a file.
Provides an overview of layers available in a file or URL together with their
geometry types. When supported by the data source, this includes both spatial and
non-spatial layers. Non-spatial layers are indicated by the ``"geometry_type"``
column being ``None``. GeoPandas will not read such layers but they can be read into
a pd.DataFrame using :func:`pyogrio.read_dataframe`.
Parameters
----------
filename : str, path object or file-like object
Either the absolute or relative path to the file or URL to
be opened, or any object with a read() method (such as an open file
or StringIO)
Returns
-------
pandas.DataFrame
A DataFrame with columns "name" and "geometry_type" and one row per layer.
"""
_import_pyogrio()
_check_pyogrio("list_layers")
import pyogrio
return pd.DataFrame(
pyogrio.list_layers(filename), columns=["name", "geometry_type"]
)

View File

@@ -0,0 +1,473 @@
import warnings
from contextlib import contextmanager
from functools import lru_cache
import pandas as pd
import shapely
import shapely.wkb
from geopandas import GeoDataFrame
@contextmanager
def _get_conn(conn_or_engine):
"""
Yield a connection within a transaction context.
Engine.begin() returns a Connection with an implicit Transaction while
Connection.begin() returns the Transaction. This helper will always return a
Connection with an implicit (possibly nested) Transaction.
Parameters
----------
conn_or_engine : Connection or Engine
A sqlalchemy Connection or Engine instance
Returns
-------
Connection
"""
from sqlalchemy.engine.base import Connection, Engine
if isinstance(conn_or_engine, Connection):
if not conn_or_engine.in_transaction():
with conn_or_engine.begin():
yield conn_or_engine
else:
yield conn_or_engine
elif isinstance(conn_or_engine, Engine):
with conn_or_engine.begin() as conn:
yield conn
else:
raise ValueError(f"Unknown Connectable: {conn_or_engine}")
def _df_to_geodf(df, geom_col="geom", crs=None, con=None):
"""
Transforms a pandas DataFrame into a GeoDataFrame.
The column 'geom_col' must be a geometry column in WKB representation.
To be used to convert df based on pd.read_sql to gdf.
Parameters
----------
df : DataFrame
pandas DataFrame with geometry column in WKB representation.
geom_col : string, default 'geom'
column name to convert to shapely geometries
crs : pyproj.CRS, optional
CRS to use for the returned GeoDataFrame. The value can be anything accepted
by :meth:`pyproj.CRS.from_user_input() <pyproj.crs.CRS.from_user_input>`,
such as an authority string (eg "EPSG:4326") or a WKT string.
If not set, tries to determine CRS from the SRID associated with the
first geometry in the database, and assigns that to all geometries.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the database to query.
Returns
-------
GeoDataFrame
"""
if geom_col not in df:
raise ValueError("Query missing geometry column '{}'".format(geom_col))
if df.columns.to_list().count(geom_col) > 1:
raise ValueError(
f"Duplicate geometry column '{geom_col}' detected in SQL query output. Only"
"one geometry column is allowed."
)
geoms = df[geom_col].dropna()
if not geoms.empty:
load_geom_bytes = shapely.wkb.loads
"""Load from Python 3 binary."""
def load_geom_text(x):
"""Load from binary encoded as text."""
return shapely.wkb.loads(str(x), hex=True)
if isinstance(geoms.iat[0], bytes):
load_geom = load_geom_bytes
else:
load_geom = load_geom_text
df[geom_col] = geoms = geoms.apply(load_geom)
if crs is None:
srid = shapely.get_srid(geoms.iat[0])
# if no defined SRID in geodatabase, returns SRID of 0
if srid != 0:
try:
spatial_ref_sys_df = _get_spatial_ref_sys_df(con, srid)
except pd.errors.DatabaseError:
warning_msg = (
f"Could not find the spatial reference system table "
f"(spatial_ref_sys) in PostGIS."
f"Trying epsg:{srid} as a fallback."
)
warnings.warn(warning_msg, UserWarning, stacklevel=3)
crs = "epsg:{}".format(srid)
else:
if not spatial_ref_sys_df.empty:
auth_name = spatial_ref_sys_df["auth_name"].item()
crs = f"{auth_name}:{srid}"
else:
warning_msg = (
f"Could not find srid {srid} in the "
f"spatial_ref_sys table. "
f"Trying epsg:{srid} as a fallback."
)
warnings.warn(warning_msg, UserWarning, stacklevel=3)
crs = "epsg:{}".format(srid)
return GeoDataFrame(df, crs=crs, geometry=geom_col)
def _read_postgis(
sql,
con,
geom_col="geom",
crs=None,
index_col=None,
coerce_float=True,
parse_dates=None,
params=None,
chunksize=None,
):
"""
Returns a GeoDataFrame corresponding to the result of the query
string, which must contain a geometry column in WKB representation.
It is also possible to use :meth:`~GeoDataFrame.read_file` to read from a database.
Especially for file geodatabases like GeoPackage or SpatiaLite this can be easier.
Parameters
----------
sql : string
SQL query to execute in selecting entries from database, or name
of the table to read from the database.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the database to query.
geom_col : string, default 'geom'
column name to convert to shapely geometries
crs : dict or str, optional
CRS to use for the returned GeoDataFrame; if not set, tries to
determine CRS from the SRID associated with the first geometry in
the database, and assigns that to all geometries.
chunksize : int, default None
If specified, return an iterator where chunksize is the number of rows to
include in each chunk.
See the documentation for pandas.read_sql for further explanation
of the following parameters:
index_col, coerce_float, parse_dates, params, chunksize
Returns
-------
GeoDataFrame
Examples
--------
PostGIS
>>> from sqlalchemy import create_engine # doctest: +SKIP
>>> db_connection_url = "postgresql://myusername:mypassword@myhost:5432/mydatabase"
>>> con = create_engine(db_connection_url) # doctest: +SKIP
>>> sql = "SELECT geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
SpatiaLite
>>> sql = "SELECT ST_AsBinary(geom) AS geom, highway FROM roads"
>>> df = geopandas.read_postgis(sql, con) # doctest: +SKIP
"""
if chunksize is None:
# read all in one chunk and return a single GeoDataFrame
df = pd.read_sql(
sql,
con,
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
params=params,
chunksize=chunksize,
)
return _df_to_geodf(df, geom_col=geom_col, crs=crs, con=con)
else:
# read data in chunks and return a generator
df_generator = pd.read_sql(
sql,
con,
index_col=index_col,
coerce_float=coerce_float,
parse_dates=parse_dates,
params=params,
chunksize=chunksize,
)
return (
_df_to_geodf(df, geom_col=geom_col, crs=crs, con=con) for df in df_generator
)
def _get_geometry_type(gdf):
"""
Get basic geometry type of a GeoDataFrame. See more info from:
https://geoalchemy-2.readthedocs.io/en/latest/types.html#geoalchemy2.types._GISType
The following rules apply:
- if geometries all share the same geometry-type,
geometries are inserted with the given GeometryType with following types:
- Point, LineString, Polygon, MultiPoint, MultiLineString, MultiPolygon,
GeometryCollection.
- LinearRing geometries will be converted into LineString -objects.
- in all other cases, geometries will be inserted with type GEOMETRY:
- a mix of Polygons and MultiPolygons in GeoSeries
- a mix of Points and LineStrings in GeoSeries
- geometry is of type GeometryCollection,
such as GeometryCollection([Point, LineStrings])
- if any of the geometries has Z-coordinate, all records will
be written with 3D.
"""
geom_types = list(gdf.geometry.geom_type.unique())
has_curve = False
for gt in geom_types:
if gt is None:
continue
elif "LinearRing" in gt:
has_curve = True
if len(geom_types) == 1:
if has_curve:
target_geom_type = "LINESTRING"
else:
if geom_types[0] is None:
raise ValueError("No valid geometries in the data.")
else:
target_geom_type = geom_types[0].upper()
else:
target_geom_type = "GEOMETRY"
# Check for 3D-coordinates
if any(gdf.geometry.has_z):
target_geom_type += "Z"
return target_geom_type, has_curve
def _get_srid_from_crs(gdf):
"""
Get EPSG code from CRS if available. If not, return 0.
"""
# Use geoalchemy2 default for srid
# Note: undefined srid in PostGIS is 0
srid = None
warning_msg = (
"Could not parse CRS from the GeoDataFrame. "
"Inserting data without defined CRS."
)
if gdf.crs is not None:
try:
for confidence in (100, 70, 25):
srid = gdf.crs.to_epsg(min_confidence=confidence)
if srid is not None:
break
auth_srid = gdf.crs.to_authority(
auth_name="ESRI", min_confidence=confidence
)
if auth_srid is not None:
srid = int(auth_srid[1])
break
except Exception:
warnings.warn(warning_msg, UserWarning, stacklevel=2)
if srid is None:
srid = 0
warnings.warn(warning_msg, UserWarning, stacklevel=2)
return srid
def _convert_linearring_to_linestring(gdf, geom_name):
from shapely.geometry import LineString
# Todo: Use shapely function once it's implemented:
# https://github.com/shapely/shapely/issues/1617
mask = gdf.geom_type == "LinearRing"
gdf.loc[mask, geom_name] = gdf.loc[mask, geom_name].apply(
lambda geom: LineString(geom)
)
return gdf
def _convert_to_ewkb(gdf, geom_name, srid):
"""Convert geometries to ewkb."""
geoms = shapely.to_wkb(
shapely.set_srid(gdf[geom_name].values._data, srid=srid),
hex=True,
include_srid=True,
)
# The gdf will warn that the geometry column doesn't hold in-memory geometries
# now that they are EWKB, so convert back to a regular dataframe to avoid warning
# the user that the dtypes are unexpected.
df = pd.DataFrame(gdf, copy=False)
df[geom_name] = geoms
return df
def _psql_insert_copy(tbl, conn, keys, data_iter):
import csv
import io
s_buf = io.StringIO()
writer = csv.writer(s_buf)
writer.writerows(data_iter)
s_buf.seek(0)
columns = ", ".join('"{}"'.format(k) for k in keys)
dbapi_conn = conn.connection
sql = 'COPY "{}"."{}" ({}) FROM STDIN WITH CSV'.format(
tbl.table.schema, tbl.table.name, columns
)
with dbapi_conn.cursor() as cur:
# Use psycopg method if it's available
if hasattr(cur, "copy") and callable(cur.copy):
with cur.copy(sql) as copy:
copy.write(s_buf.read())
else: # otherwise use psycopg2 method
cur.copy_expert(sql, s_buf)
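# Illustrative note: ``_psql_insert_copy`` is not called directly; it is
# passed to ``pandas.DataFrame.to_sql`` as ``method=_psql_insert_copy`` (see
# ``_write_postgis`` below), so pandas supplies the table, connection, column
# names and row iterator, and the rows are streamed via COPY ... FROM STDIN.
#
#   >>> df.to_sql("my_table", engine, method=_psql_insert_copy)  # doctest: +SKIP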
def _write_postgis(
gdf,
name,
con,
schema=None,
if_exists="fail",
index=False,
index_label=None,
chunksize=None,
dtype=None,
):
"""
Upload GeoDataFrame into PostGIS database.
This method requires SQLAlchemy and GeoAlchemy2, and a PostgreSQL
Python driver (e.g. psycopg2) to be installed.
Parameters
----------
name : str
Name of the target table.
con : sqlalchemy.engine.Connection or sqlalchemy.engine.Engine
Active connection to the PostGIS database.
if_exists : {'fail', 'replace', 'append'}, default 'fail'
How to behave if the table already exists:
- fail: Raise a ValueError.
- replace: Drop the table before inserting new values.
- append: Insert new values to the existing table.
schema : string, optional
Specify the schema. If None, use default schema: 'public'.
index : bool, default True
Write DataFrame index as a column.
Uses *index_label* as the column name in the table.
index_label : string or sequence, default None
Column label for index column(s).
If None is given (default) and index is True,
then the index names are used.
chunksize : int, optional
Rows will be written in batches of this size at a time.
By default, all rows will be written at once.
dtype : dict of column name to SQL type, default None
Specifying the datatype for columns.
The keys should be the column names and the values
should be the SQLAlchemy types.
Examples
--------
>>> from sqlalchemy import create_engine # doctest: +SKIP
>>> engine = create_engine("postgresql://myusername:mypassword@myhost:5432\
/mydatabase";) # doctest: +SKIP
>>> gdf.to_postgis("my_table", engine) # doctest: +SKIP
"""
try:
from geoalchemy2 import Geometry
from sqlalchemy import text
except ImportError:
raise ImportError("'to_postgis()' requires geoalchemy2 package.")
gdf = gdf.copy()
geom_name = gdf.geometry.name
# Get srid
srid = _get_srid_from_crs(gdf)
# Get geometry type and info whether data contains LinearRing.
geometry_type, has_curve = _get_geometry_type(gdf)
# Build dtype with Geometry
if dtype is not None:
dtype[geom_name] = Geometry(geometry_type=geometry_type, srid=srid)
else:
dtype = {geom_name: Geometry(geometry_type=geometry_type, srid=srid)}
# Convert LinearRing geometries to LineString
if has_curve:
gdf = _convert_linearring_to_linestring(gdf, geom_name)
# Convert geometries to EWKB
gdf = _convert_to_ewkb(gdf, geom_name, srid)
if schema is not None:
schema_name = schema
else:
schema_name = "public"
if if_exists == "append":
# Check that the geometry srid matches with the current GeoDataFrame
with _get_conn(con) as connection:
# Only check SRID if table exists
if connection.dialect.has_table(connection, name, schema):
target_srid = connection.execute(
text(
"SELECT Find_SRID('{schema}', '{table}', '{geom_col}');".format(
schema=schema_name, table=name, geom_col=geom_name
)
)
).fetchone()[0]
if target_srid != srid:
msg = (
"The CRS of the target table (EPSG:{epsg_t}) differs from the "
"CRS of current GeoDataFrame (EPSG:{epsg_src}).".format(
epsg_t=target_srid, epsg_src=srid
)
)
raise ValueError(msg)
with _get_conn(con) as connection:
gdf.to_sql(
name,
connection,
schema=schema_name,
if_exists=if_exists,
index=index,
index_label=index_label,
chunksize=chunksize,
dtype=dtype,
method=_psql_insert_copy,
)
@lru_cache
def _get_spatial_ref_sys_df(con, srid):
spatial_ref_sys_sql = (
f"SELECT srid, auth_name FROM spatial_ref_sys WHERE srid = {srid}"
)
return pd.read_sql(spatial_ref_sys_sql, con)

Some files were not shown because too many files have changed in this diff.