Toblerity · snorfalorpagus · Jan 21, 2018 · Jan 21, 2018 · Jan 22, 2018 · Jan 22, 2018
diff --git a/.gitignore b/.gitignore
@@ -76,6 +76,7 @@ fiona/_shim2.c
 fiona/_shim22.c
 fiona/_shim.pxd
 fiona/_shim.pyx
+fiona/_vectorized.c
 tests/data/coutwildrnp.json
 tests/data/coutwildrnp.tar
 tests/data/coutwildrnp.zip

diff --git a/fiona/_vectorized.pyx b/fiona/_vectorized.pyx
@@ -0,0 +1,188 @@
+from .ogrext cimport Session, _deleteOgrFeature
+from .ogrext import FIELD_TYPES, FIELD_TYPES_MAP, OGRERR_NONE
+from ._shim cimport *
+from libc.stdlib cimport malloc, free
+from fiona.rfc3339 import FionaDateType, FionaDateTimeType, FionaTimeType
+
+import logging
+from six import text_type
+import datetime
+
+import numpy as np
+cimport numpy as np
+
+log = logging.getLogger(__name__)
+
+def read_vectorized(collection, use_wkb=False):
+    cdef Session session
+    cdef void * cogr_feature
+    cdef void * cogr_geometry
+    cdef int num_fields
+    cdef void * fdefn
+    cdef int feature_index
+    cdef int field_index
+    cdef char * field_name_c
+    cdef bytes field_name_bytes
+    cdef int i
+    cdef int l
+    cdef int y = 0
+    cdef int m = 0
+    cdef int d = 0
+    cdef int hh = 0
+    cdef int mm = 0
+    cdef int ss = 0
+    cdef int tz = 0
+    cdef long long fid
+    cdef long long [:] arr_int
+    cdef double [:] arr_double
+    cdef char * wkt
+    cdef char * wkb
+
+    session = collection.session
+    encoding = session._fileencoding
+
+    if session.cogr_layer == NULL:
+        raise ValueError("Null layer")
+
+    length = OGR_L_GetFeatureCount(session.cogr_layer, 0)
+
+    data_fids = np.empty([length], dtype=object)
+    data_properties = {}
+
+    if collection.ignore_fields:
+        ignore_fields = set(collection.ignore_fields)
+    else:
+        ignore_fields = set()
+
+    if collection.ignore_geometry:
+        ignore_geometry = True
+        data_geometry = None
+    else:
+        ignore_geometry = False
+        data_geometry = np.empty([length], dtype=object)
+
+    schema = session.get_schema()
+    for field_name, field_type in schema["properties"].items():
+        if field_name in ignore_fields:
+            continue
+        if ":" in field_type:
+            field_type, precision = field_type.split(":")
+        else:
+            precision = None
+        if field_type == "int":
+            data_properties[field_name] = np.empty([length], dtype=np.int64)
+        elif field_type == "float":
+            data_properties[field_name] = np.empty([length], dtype=np.float64)
+        elif field_type == "str":
+            data_properties[field_name] = np.empty([length], dtype=object)
+        elif field_type == "bytes":
+            data_properties[field_name] = np.empty([length], dtype=object)
+        elif field_type == "date":
+            data_properties[field_name] = np.empty([length], dtype='datetime64[D]')
+        elif field_type == "time":
+            # numpy has no dtype for time without date
+            data_properties[field_name] = np.empty([length], dtype=object)
+        elif field_type == "datetime":
+            data_properties[field_name] = np.empty([length], dtype='datetime64[s]')
+        else:
+            raise TypeError("Unexpected field type: {}".format(field_type))
+
+    OGR_L_ResetReading(session.cogr_layer)
+    for feature_index in range(length):
+        cogr_feature = OGR_L_GetNextFeature(session.cogr_layer)
+
+        if cogr_feature == NULL:
+            raise ValueError("Failed to read feature {}".format(feature_index))
+
+        fid = OGR_F_GetFID(cogr_feature)
+        data_fids[feature_index] = str(fid)
+
+        num_fields = OGR_F_GetFieldCount(cogr_feature)
+        for field_index in range(num_fields):
+            fdefn = OGR_F_GetFieldDefnRef(cogr_feature, field_index)
+
+            # field name
+            field_name_c = OGR_Fld_GetNameRef(fdefn)
+            field_name_bytes = field_name_c
+            field_name = field_name_bytes.decode(encoding)
+            if field_name in ignore_fields:
+                continue
+
+            # field type
+            field_type_id = OGR_Fld_GetType(fdefn)
+            field_type_name = FIELD_TYPES[field_type_id]
+            field_type = FIELD_TYPES_MAP[field_type_name]
+
+            if field_type is int:
+                # TODO: support boolean subtype
+                arr_int = data_properties[field_name]
+                if is_field_null(cogr_feature, field_index):
+                    # TODO: is this the best way to handle NULL values for int?
+                    arr_int[feature_index] = 0
+                else:
+                    arr_int[feature_index] = OGR_F_GetFieldAsInteger64(cogr_feature, field_index)
+            elif field_type is float:
+                arr_double = data_properties[field_name]
+                if is_field_null(cogr_feature, field_index):
+                    arr_double[feature_index] = np.nan
+                else:
+                    arr_double[feature_index] = OGR_F_GetFieldAsDouble(cogr_feature, field_index)
+            elif field_type is text_type:
+                if is_field_null(cogr_feature, field_index):
+                    value = None
+                else:
+                    try:
+                        value = OGR_F_GetFieldAsString(cogr_feature, field_index)
+                        value = value.decode(encoding)
+                    except UnicodeDecodeError:
+                        log.warning(
+                            "Failed to decode %s using %s codec", value, encoding)
+                arr = data_properties[field_name]
+                arr[feature_index] = value
+            elif field_type in (FionaDateType, FionaTimeType, FionaDateTimeType):
+                arr = data_properties[field_name]
+                retval = OGR_F_GetFieldAsDateTime(
+                    cogr_feature, field_index, &y, &m, &d, &hh, &mm, &ss, &tz)
+                if not retval:
+                    arr[feature_index] = None
+                else:
+                    if field_type is FionaDateType:
+                        arr[feature_index] = datetime.date(y, m, d).isoformat()
+                    elif field_type is FionaTimeType:
+                        arr[feature_index] = datetime.time(hh, mm, ss).isoformat()
+                    else:
+                        arr[feature_index] = datetime.datetime(y, m, d, hh, mm, ss).isoformat()
+            elif field_type is bytes:
+                data = OGR_F_GetFieldAsBinary(cogr_feature, field_index, &l)
+                arr = data_properties[field_name]
+                arr[feature_index] = data[:l]
+            else:
+                raise TypeError("Unexpected field type: {}".format(field_type))
+
+        if not ignore_geometry:
+            cogr_geometry = OGR_F_GetGeometryRef(cogr_feature)
+            if cogr_geometry == NULL:
+                data_geometry[feature_index] = None
+            elif use_wkb:
+                length = OGR_G_WkbSize(cogr_geometry)
+                wkb = <char*>malloc(sizeof(char)*length)
+                result = OGR_G_ExportToWkb(cogr_geometry, 1, wkb)
+                if result != OGRERR_NONE:
+                    raise ValueError("Failed to export geometry to WKB")
+                data_geometry[feature_index] = wkb[:length]
+                free(wkb)
+            else:
+                result = OGR_G_ExportToWkt(cogr_geometry, &wkt)
+                if result != OGRERR_NONE:
+                    raise ValueError("Failed to export geometry to WKT")
+                data_geometry[feature_index] = wkt
+
+        _deleteOgrFeature(cogr_feature)
+
+    features = {
+        "id": data_fids,
+        "geometry": data_geometry,
+        "properties": data_properties,
+    }
+
+    return features
diff --git a/fiona/ogrext.pxd b/fiona/ogrext.pxd
@@ -0,0 +1,8 @@
+cdef class Session:
+    cdef void *cogr_ds
+    cdef void *cogr_layer
+    cdef object _fileencoding
+    cdef object _encoding
+    cdef object collection
+
+cdef _deleteOgrFeature(void *cogr_feature)
diff --git a/fiona/ogrext.pyx b/fiona/ogrext.pyx
@@ -397,13 +397,6 @@ def featureRT(feature, collection):
 # Collection-related extension classes and functions
 
 cdef class Session:
-
-    cdef void *cogr_ds
-    cdef void *cogr_layer
-    cdef object _fileencoding
-    cdef object _encoding
-    cdef object collection
-
     def __init__(self):
         self.cogr_ds = NULL
         self.cogr_layer = NULL

diff --git a/fiona/ogrext1.pxd b/fiona/ogrext1.pxd
@@ -135,7 +135,8 @@ cdef extern from "ogr_api.h":
     void *  OGR_G_CreateGeometry (int wkbtypecode)
     void    OGR_G_DestroyGeometry (void *geometry)
     unsigned char *  OGR_G_ExportToJson (void *geometry)
-    void    OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer)
+    OGRErr  OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer)
+    OGRErr  OGR_G_ExportToWkt (void *geometry, char **wkt)
     int     OGR_G_GetCoordinateDimension (void *geometry)
     int     OGR_G_GetGeometryCount (void *geometry)
     unsigned char *  OGR_G_GetGeometryName (void *geometry)

diff --git a/fiona/ogrext2.pxd b/fiona/ogrext2.pxd
@@ -196,7 +196,8 @@ cdef extern from "ogr_api.h":
     void *  OGR_G_CreateGeometry (int wkbtypecode)
     void    OGR_G_DestroyGeometry (void *geometry)
     unsigned char *  OGR_G_ExportToJson (void *geometry)
-    void    OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer)
+    OGRErr  OGR_G_ExportToWkb (void *geometry, int endianness, char *buffer)
+    OGRErr  OGR_G_ExportToWkt (void *geometry, char **wkt)
     int     OGR_G_GetCoordinateDimension (void *geometry)
     int     OGR_G_GetGeometryCount (void *geometry)
     unsigned char *  OGR_G_GetGeometryName (void *geometry)

diff --git a/requirements.txt b/requirements.txt
@@ -3,3 +3,4 @@ cligj>=0.4
 six>=1.7
 ordereddict
 munch
+numpy
diff --git a/setup.py b/setup.py
@@ -9,6 +9,13 @@
 from setuptools import setup
 from setuptools.extension import Extension
 
+# NumPy is required for vectorized submodule
+try:
+    import numpy as np
+except ImportError:
+    has_numpy = False
+else:
+    has_numpy = True
 
 # Use Cython if available.
 try:
@@ -180,6 +187,9 @@ def run(self):
     libraries=libraries,
     extra_link_args=extra_link_args)
 
+if has_numpy:
+    ext_options["include_dirs"].append(np.get_include())
+
 ext_options_cpp = ext_options.copy()
 # GDAL 2.3+ requires C++11
 if sys.platform == "win32":
@@ -212,14 +222,17 @@ def run(self):
             shutil.copy('fiona/_shim2.pyx', 'fiona/_shim.pyx')
             shutil.copy('fiona/_shim2.pxd', 'fiona/_shim.pxd')
 
-    ext_modules = cythonize([
+    ext_modules = [
         Extension('fiona._geometry', ['fiona/_geometry.pyx'], **ext_options),
         Extension('fiona._transform', ['fiona/_transform.pyx'], **ext_options_cpp),
         Extension('fiona._crs', ['fiona/_crs.pyx'], **ext_options),
         Extension('fiona._drivers', ['fiona/_drivers.pyx'], **ext_options),
         Extension('fiona._err', ['fiona/_err.pyx'], **ext_options),
         Extension('fiona._shim', ['fiona/_shim.pyx'], **ext_options),
-        Extension('fiona.ogrext', ['fiona/ogrext.pyx'], **ext_options)])
+        Extension('fiona.ogrext', ['fiona/ogrext.pyx'], **ext_options)]
+    if has_numpy:
+        ext_modules.append(Extension('fiona._vectorized', ['fiona/_vectorized.pyx'], **ext_options))
+    ext_modules = cythonize(ext_modules)
 
 # If there's no manifest template, as in an sdist, we just specify .c files.
 elif "clean" not in sys.argv:
@@ -231,6 +244,8 @@ def run(self):
         Extension('fiona._err', ['fiona/_err.c'], **ext_options),
         Extension('fiona.ogrext', ['fiona/ogrext.c'], **ext_options),
     ]
+    if has_numpy:
+        ext_modules.append(Extension('fiona._vectorized', ['fiona/_vectorized.c'], **ext_options))
 
     if gdal_major_version == 1:
         log.info("Building Fiona for gdal 1.x: {0}".format(gdalversion))

diff --git a/tests/test_binary_field.py b/tests/test_binary_field.py
@@ -9,6 +9,33 @@
 from collections import OrderedDict
 from .conftest import requires_gpkg
 
+def write_binary_gpkg(path):
+    meta = {
+        "driver": "GPKG",
+        "schema": {
+            "geometry": "Point",
+            "properties": OrderedDict([
+                ("name", "str"),
+                ("data", "bytes"),
+            ])
+        }
+    }
+
+    # create some binary data to encode
+    data = binascii.a2b_hex(b"deadbeef")
+
+    # write the binary data to a BLOB field
+    with fiona.open(path, "w", **meta) as dst:
+        feature = {
+            "geometry": {"type": "Point", "coordinates": ((0,0))},
+            "properties": {
+                "name": "test",
+                "data": data
+            }
+        }
+        dst.write(feature)
+
+
 class TestBinaryField(unittest.TestCase):
     def setUp(self):
         self.tempdir = tempfile.mkdtemp()
@@ -18,33 +45,9 @@ def tearDown(self):
 
     @requires_gpkg
     def test_binary_field(self):
-        meta = {
-            "driver": "GPKG",
-            "schema": {
-                "geometry": "Point",
-                "properties": OrderedDict([
-                    ("name", "str"),
-                    ("data", "bytes"),
-                ])
-            }
-        }
-
-        # create some binary data to encode
-        data = binascii.a2b_hex(b"deadbeef")
-
-        # write the binary data to a BLOB field
         filename = os.path.join(self.tempdir, "binary_test.gpkg")
-        with fiona.open(filename, "w", **meta) as dst:
-            feature = {
-                "geometry": {"type": "Point", "coordinates": ((0,0))},
-                "properties": {
-                    "name": "test",
-                    "data": data
-                }
-            }
-            dst.write(feature)
-
-        del(data)
+
+        write_binary_gpkg(filename)
 
         # read the data back and check consistency
         with fiona.open(filename, "r") as src: