Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Initial commit of bulk reading into numpy arrays #540

Closed
wants to merge 10 commits into from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ fiona/_shim2.c
fiona/_shim22.c
fiona/_shim.pxd
fiona/_shim.pyx
fiona/_vectorized.c
tests/data/coutwildrnp.json
tests/data/coutwildrnp.tar
tests/data/coutwildrnp.zip
Expand Down
188 changes: 188 additions & 0 deletions fiona/_vectorized.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
from .ogrext cimport Session, _deleteOgrFeature
from .ogrext import FIELD_TYPES, FIELD_TYPES_MAP, OGRERR_NONE
from ._shim cimport *
from libc.stdlib cimport malloc, free
from fiona.rfc3339 import FionaDateType, FionaDateTimeType, FionaTimeType

import logging
from six import text_type
import datetime

import numpy as np
cimport numpy as np

log = logging.getLogger(__name__)

def read_vectorized(collection, use_wkb=False):
    """Read every feature of ``collection`` into column-oriented NumPy arrays.

    Parameters
    ----------
    collection
        Object exposing ``session`` (a ``Session`` holding an OGR layer),
        ``ignore_fields`` and ``ignore_geometry``.
    use_wkb : bool, optional
        When true, geometries are exported as WKB bytes (byte-order flag 1);
        otherwise they are exported as WKT bytes.

    Returns
    -------
    dict
        ``{"id": ..., "geometry": ..., "properties": ...}`` where ``id`` is
        an object array of FIDs rendered with ``str``, ``geometry`` is an
        object array (or ``None`` when the collection ignores geometry) and
        ``properties`` maps each non-ignored field name to one array.

    Raises
    ------
    ValueError
        If the layer is NULL, the feature count cannot be determined, a
        feature cannot be read, or a geometry cannot be exported.
    TypeError
        If a field has a type this function does not handle.
    MemoryError
        If the temporary WKB buffer cannot be allocated.
    """
    cdef Session session
    cdef void * cogr_feature
    cdef void * cogr_geometry
    cdef int num_fields
    cdef void * fdefn
    cdef int feature_index
    cdef int field_index
    cdef char * field_name_c
    cdef bytes field_name_bytes
    cdef int blob_len = 0
    cdef int wkb_size = 0
    cdef int y = 0
    cdef int m = 0
    cdef int d = 0
    cdef int hh = 0
    cdef int mm = 0
    cdef int ss = 0
    cdef int tz = 0
    cdef long long fid
    cdef long long [:] arr_int
    cdef double [:] arr_double
    cdef char * blob = NULL
    cdef char * wkt = NULL
    cdef char * wkb = NULL

    session = collection.session
    encoding = session._fileencoding

    if session.cogr_layer == NULL:
        raise ValueError("Null layer")

    # With bForce=0 a driver may answer -1 ("unknown"); only then pay for a
    # forced count, and refuse to size the arrays from a negative number.
    length = OGR_L_GetFeatureCount(session.cogr_layer, 0)
    if length < 0:
        length = OGR_L_GetFeatureCount(session.cogr_layer, 1)
    if length < 0:
        raise ValueError("Unable to determine feature count")

    data_fids = np.empty([length], dtype=object)
    data_properties = {}
    # Review comment (contributor) left on the line above:
    # "Thoughts on using a structured array for data_properties?"

    if collection.ignore_fields:
        ignore_fields = set(collection.ignore_fields)
    else:
        ignore_fields = set()

    ignore_geometry = bool(collection.ignore_geometry)
    if ignore_geometry:
        data_geometry = None
    else:
        data_geometry = np.empty([length], dtype=object)

    # Schema type name -> dtype of the column that stores it.
    column_dtypes = {
        "int": np.int64,
        "float": np.float64,
        "str": object,
        "bytes": object,
        "date": 'datetime64[D]',
        # numpy has no dtype for time without date
        "time": object,
        "datetime": 'datetime64[s]',
    }

    schema = session.get_schema()
    for field_name, field_type in schema["properties"].items():
        if field_name in ignore_fields:
            continue
        # Drop any ":<precision>" suffix; only the base type picks the dtype.
        field_type = field_type.partition(":")[0]
        if field_type not in column_dtypes:
            raise TypeError("Unexpected field type: {}".format(field_type))
        data_properties[field_name] = np.empty(
            [length], dtype=column_dtypes[field_type])

    OGR_L_ResetReading(session.cogr_layer)
    for feature_index in range(length):
        cogr_feature = OGR_L_GetNextFeature(session.cogr_layer)

        if cogr_feature == NULL:
            raise ValueError("Failed to read feature {}".format(feature_index))

        # try/finally so the feature is released even when a field or
        # geometry conversion below raises.
        try:
            fid = OGR_F_GetFID(cogr_feature)
            data_fids[feature_index] = str(fid)

            num_fields = OGR_F_GetFieldCount(cogr_feature)
            for field_index in range(num_fields):
                fdefn = OGR_F_GetFieldDefnRef(cogr_feature, field_index)

                # field name
                field_name_c = OGR_Fld_GetNameRef(fdefn)
                field_name_bytes = field_name_c
                field_name = field_name_bytes.decode(encoding)
                if field_name in ignore_fields:
                    continue

                # field type
                field_type_id = OGR_Fld_GetType(fdefn)
                field_type_name = FIELD_TYPES[field_type_id]
                field_type = FIELD_TYPES_MAP[field_type_name]

                if field_type is int:
                    # TODO: support boolean subtype
                    arr_int = data_properties[field_name]
                    if is_field_null(cogr_feature, field_index):
                        # TODO: is this the best way to handle NULL values for int?
                        arr_int[feature_index] = 0
                    else:
                        arr_int[feature_index] = OGR_F_GetFieldAsInteger64(
                            cogr_feature, field_index)
                elif field_type is float:
                    arr_double = data_properties[field_name]
                    if is_field_null(cogr_feature, field_index):
                        arr_double[feature_index] = np.nan
                    else:
                        arr_double[feature_index] = OGR_F_GetFieldAsDouble(
                            cogr_feature, field_index)
                elif field_type is text_type:
                    if is_field_null(cogr_feature, field_index):
                        value = None
                    else:
                        value = OGR_F_GetFieldAsString(cogr_feature, field_index)
                        try:
                            value = value.decode(encoding)
                        except UnicodeDecodeError:
                            # Best effort: keep the undecoded bytes.
                            log.warning(
                                "Failed to decode %s using %s codec", value, encoding)
                    arr = data_properties[field_name]
                    arr[feature_index] = value
                elif field_type in (FionaDateType, FionaTimeType, FionaDateTimeType):
                    arr = data_properties[field_name]
                    retval = OGR_F_GetFieldAsDateTime(
                        cogr_feature, field_index, &y, &m, &d, &hh, &mm, &ss, &tz)
                    if not retval:
                        arr[feature_index] = None
                    elif field_type is FionaDateType:
                        arr[feature_index] = datetime.date(y, m, d).isoformat()
                    elif field_type is FionaTimeType:
                        arr[feature_index] = datetime.time(hh, mm, ss).isoformat()
                    else:
                        arr[feature_index] = datetime.datetime(
                            y, m, d, hh, mm, ss).isoformat()
                elif field_type is bytes:
                    arr = data_properties[field_name]
                    if is_field_null(cogr_feature, field_index):
                        arr[feature_index] = None
                    else:
                        # Keep the result in a typed C pointer and slice by the
                        # reported length: coercing it to a Python object first
                        # would stop at the first zero byte of the blob.
                        blob = <char *>OGR_F_GetFieldAsBinary(
                            cogr_feature, field_index, &blob_len)
                        if blob == NULL:
                            arr[feature_index] = None
                        else:
                            arr[feature_index] = blob[:blob_len]
                else:
                    raise TypeError("Unexpected field type: {}".format(field_type))

            if not ignore_geometry:
                cogr_geometry = OGR_F_GetGeometryRef(cogr_feature)
                if cogr_geometry == NULL:
                    data_geometry[feature_index] = None
                elif use_wkb:
                    # A separate variable: reusing ``length`` here would
                    # overwrite the feature count.
                    wkb_size = OGR_G_WkbSize(cogr_geometry)
                    wkb = <char*>malloc(sizeof(char) * wkb_size)
                    if wkb == NULL:
                        raise MemoryError("Failed to allocate WKB buffer")
                    try:
                        result = OGR_G_ExportToWkb(cogr_geometry, 1, wkb)
                        if result != OGRERR_NONE:
                            raise ValueError("Failed to export geometry to WKB")
                        data_geometry[feature_index] = wkb[:wkb_size]
                    finally:
                        free(wkb)
                else:
                    result = OGR_G_ExportToWkt(cogr_geometry, &wkt)
                    if result != OGRERR_NONE:
                        raise ValueError("Failed to export geometry to WKT")
                    # Coercing the char* copies the text into a bytes object.
                    data_geometry[feature_index] = wkt
                    # NOTE(review): GDAL allocates ``wkt`` and expects the
                    # caller to release it with CPLFree(); no such call is
                    # made, so the buffer leaks once per feature. Add
                    # CPLFree(wkt) here once the declaration is confirmed to
                    # be available through the _shim cimport.
        finally:
            _deleteOgrFeature(cogr_feature)

    features = {
        "id": data_fids,
        "geometry": data_geometry,
        "properties": data_properties,
    }

    return features
8 changes: 8 additions & 0 deletions fiona/ogrext.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Shared declarations so that other Cython modules (fiona/_vectorized.pyx
# cimports both names) can use Session's C-level attributes and the
# feature-release helper.
cdef class Session:
# Set to NULL in __init__; presumably the OGR data source handle - TODO confirm.
cdef void *cogr_ds
# Set to NULL in __init__; passed to the OGR_L_* layer functions by readers.
cdef void *cogr_layer
# Codec name used by fiona/_vectorized.pyx to decode field names and string values.
cdef object _fileencoding
# NOTE(review): usage not visible in this diff - confirm how it differs from _fileencoding.
cdef object _encoding
# NOTE(review): presumably the owning collection object - confirm against ogrext.pyx.
cdef object collection

# Called by fiona/_vectorized.pyx once it is done with a feature handle.
cdef _deleteOgrFeature(void *cogr_feature)
7 changes: 0 additions & 7 deletions fiona/ogrext.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -397,13 +397,6 @@ def featureRT(feature, collection):
# Collection-related extension classes and functions

cdef class Session:

cdef void *cogr_ds
cdef void *cogr_layer
cdef object _fileencoding
cdef object _encoding
cdef object collection

def __init__(self):
self.cogr_ds = NULL
self.cogr_layer = NULL
Expand Down
3 changes: 2 additions & 1 deletion fiona/ogrext1.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ cdef extern from "ogr_api.h":
void * OGR_G_CreateGeometry (int wkbtypecode)
void OGR_G_DestroyGeometry (void *geometry)
unsigned char * OGR_G_ExportToJson (void *geometry)
void OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer)
OGRErr OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer)
OGRErr OGR_G_ExportToWkt (void *geometry, char **wkt)
int OGR_G_GetCoordinateDimension (void *geometry)
int OGR_G_GetGeometryCount (void *geometry)
unsigned char * OGR_G_GetGeometryName (void *geometry)
Expand Down
3 changes: 2 additions & 1 deletion fiona/ogrext2.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,8 @@ cdef extern from "ogr_api.h":
void * OGR_G_CreateGeometry (int wkbtypecode)
void OGR_G_DestroyGeometry (void *geometry)
unsigned char * OGR_G_ExportToJson (void *geometry)
void OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer)
OGRErr OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer)
OGRErr OGR_G_ExportToWkt (void *geometry, char **wkt)
int OGR_G_GetCoordinateDimension (void *geometry)
int OGR_G_GetGeometryCount (void *geometry)
unsigned char * OGR_G_GetGeometryName (void *geometry)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ cligj>=0.4
six>=1.7
ordereddict
munch
numpy
19 changes: 17 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@
from setuptools import setup
from setuptools.extension import Extension

# NumPy is required for vectorized submodule
# It is optional at build time: has_numpy gates whether the NumPy include
# directory is added and whether the fiona._vectorized extension is built.
try:
import numpy as np
except ImportError:
has_numpy = False
else:
has_numpy = True

# Use Cython if available.
try:
Expand Down Expand Up @@ -180,6 +187,9 @@ def run(self):
libraries=libraries,
extra_link_args=extra_link_args)

if has_numpy:
ext_options["include_dirs"].append(np.get_include())

ext_options_cpp = ext_options.copy()
# GDAL 2.3+ requires C++11
if sys.platform == "win32":
Expand Down Expand Up @@ -212,14 +222,17 @@ def run(self):
shutil.copy('fiona/_shim2.pyx', 'fiona/_shim.pyx')
shutil.copy('fiona/_shim2.pxd', 'fiona/_shim.pxd')

ext_modules = cythonize([
ext_modules = [
Extension('fiona._geometry', ['fiona/_geometry.pyx'], **ext_options),
Extension('fiona._transform', ['fiona/_transform.pyx'], **ext_options_cpp),
Extension('fiona._crs', ['fiona/_crs.pyx'], **ext_options),
Extension('fiona._drivers', ['fiona/_drivers.pyx'], **ext_options),
Extension('fiona._err', ['fiona/_err.pyx'], **ext_options),
Extension('fiona._shim', ['fiona/_shim.pyx'], **ext_options),
Extension('fiona.ogrext', ['fiona/ogrext.pyx'], **ext_options)])
Extension('fiona.ogrext', ['fiona/ogrext.pyx'], **ext_options)]
if has_numpy:
ext_modules.append(Extension('fiona._vectorized', ['fiona/_vectorized.pyx'], **ext_options))
ext_modules = cythonize(ext_modules)

# If there's no manifest template, as in an sdist, we just specify .c files.
elif "clean" not in sys.argv:
Expand All @@ -231,6 +244,8 @@ def run(self):
Extension('fiona._err', ['fiona/_err.c'], **ext_options),
Extension('fiona.ogrext', ['fiona/ogrext.c'], **ext_options),
]
if has_numpy:
ext_modules.append(Extension('fiona._vectorized', ['fiona/_vectorized.c'], **ext_options))

if gdal_major_version == 1:
log.info("Building Fiona for gdal 1.x: {0}".format(gdalversion))
Expand Down
55 changes: 29 additions & 26 deletions tests/test_binary_field.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,33 @@
from collections import OrderedDict
from .conftest import requires_gpkg

def write_binary_gpkg(path):
    """Write a one-feature GeoPackage with a binary field to ``path``.

    The dataset has a Point geometry and two properties: ``name`` (str)
    and ``data`` (bytes). The single feature sits at (0, 0), is named
    "test" and carries the four bytes decoded from hex "deadbeef".
    """
    schema = {
        "geometry": "Point",
        "properties": OrderedDict([
            ("name", "str"),
            ("data", "bytes"),
        ]),
    }

    # Binary payload that ends up in the BLOB field.
    payload = binascii.a2b_hex(b"deadbeef")

    record = {
        "geometry": {"type": "Point", "coordinates": (0, 0)},
        "properties": {"name": "test", "data": payload},
    }

    with fiona.open(path, "w", driver="GPKG", schema=schema) as dst:
        dst.write(record)


class TestBinaryField(unittest.TestCase):
def setUp(self):
self.tempdir = tempfile.mkdtemp()
Expand All @@ -18,33 +45,9 @@ def tearDown(self):

@requires_gpkg
def test_binary_field(self):
meta = {
"driver": "GPKG",
"schema": {
"geometry": "Point",
"properties": OrderedDict([
("name", "str"),
("data", "bytes"),
])
}
}

# create some binary data to encode
data = binascii.a2b_hex(b"deadbeef")

# write the binary data to a BLOB field
filename = os.path.join(self.tempdir, "binary_test.gpkg")
with fiona.open(filename, "w", **meta) as dst:
feature = {
"geometry": {"type": "Point", "coordinates": ((0,0))},
"properties": {
"name": "test",
"data": data
}
}
dst.write(feature)

del(data)

write_binary_gpkg(filename)

# read the data back and check consistency
with fiona.open(filename, "r") as src:
Expand Down
Loading