Add support for loading columns of object arrays

chrisburr · chrisburr · commit 5a20a5204ae8 · 2018-02-03T15:15:51.000+01:00
diff --git a/root_pandas/readwrite.py b/root_pandas/readwrite.py
@@ -92,6 +92,30 @@ def filter_noexpand_columns(columns):
     return other, noexpand
 
 
+def do_flatten(arr, flatten):
+    if flatten is True:
+        warnings.warn(" The option flatten=True is deprecated. Please specify the branches you would like "
+                      "to flatten in a list: flatten=['foo', 'bar']", FutureWarning)
+        arr_, idx = stretch(arr, return_indices=True)
+    else:
+        nonscalar = get_nonscalar_columns(arr)
+        fields = [x for x in arr.dtype.names if (x not in nonscalar or x in flatten)]
+
+        for col in flatten:
+            if col in nonscalar:
+                pass
+            elif col in fields:
+                raise ValueError("Requested to flatten {col} but it has a scalar type"
+                                 .format(col=col))
+            else:
+                raise ValueError("Requested to flatten {col} but it wasn't loaded from the input file"
+                                 .format(col=col))
+
+        arr_, idx = stretch(arr, fields=fields, return_indices=True)
+    arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
+    return arr
+
+
 def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
     """
     Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
@@ -174,22 +198,6 @@ def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=
         for var in ignored:
             all_vars.remove(var)
 
-    def do_flatten(arr, flatten):
-        if flatten is True:
-            warnings.warn(" The option flatten=True is deprecated. Please specify the branches you would like "
-                          "to flatten in a list: flatten=['foo', 'bar']", FutureWarning)
-            arr_, idx = stretch(arr, return_indices=True)
-        else:
-            nonscalar = get_nonscalar_columns(arr)
-            fields = [x for x in arr.dtype.names if (x not in nonscalar or x in flatten)]
-            will_drop = [x for x in arr.dtype.names if x not in fields]
-            if will_drop:
-                warnings.warn("Ignored the following non-scalar branches: {bad_names}"
-                      .format(bad_names=", ".join(will_drop)), UserWarning)
-            arr_, idx = stretch(arr, fields=fields, return_indices=True)
-        arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
-        return arr
-
     if chunksize:
         tchain = ROOT.TChain(key)
         for path in paths:
@@ -215,26 +223,45 @@ def genchunks():
 
 def convert_to_dataframe(array, start_index=None):
     nonscalar_columns = get_nonscalar_columns(array)
-    if nonscalar_columns:
-        warnings.warn("Ignored the following non-scalar branches: {bad_names}"
-                      .format(bad_names=", ".join(nonscalar_columns)), UserWarning)
-    indices = list(filter(lambda x: x.startswith('__index__') and x not in nonscalar_columns, array.dtype.names))
+
+    # Columns containing 2D arrays can't be loaded directly, so convert them to 1D arrays of arrays
+    reshaped_columns = {}
+    for col in nonscalar_columns:
+        if array[col].ndim >= 2:
+            reshaped = np.zeros(len(array[col]), dtype='O')
+            for i, row in enumerate(array[col]):
+                reshaped[i] = row
+            reshaped_columns[col] = reshaped
+
+    indices = list(filter(lambda x: x.startswith('__index__'), array.dtype.names))
     if len(indices) == 0:
         index = None
         if start_index is not None:
             index = RangeIndex(start=start_index, stop=start_index + len(array))
-        df = DataFrame.from_records(array, exclude=nonscalar_columns, index=index)
+        df = DataFrame.from_records(array, exclude=reshaped_columns, index=index)
     elif len(indices) == 1:
         # We store the index under the __index__* branch, where
         # * is the name of the index
-        df = DataFrame.from_records(array, index=indices[0], exclude=nonscalar_columns)
+        df = DataFrame.from_records(array, exclude=reshaped_columns, index=indices[0])
         index_name = indices[0][len('__index__'):]
         if not index_name:
             # None means the index has no name
             index_name = None
         df.index.name = index_name
     else:
         raise ValueError("More than one index found in file")
+
+    # Manually add the columns which were reshaped
+    for key, reshaped in reshaped_columns.items():
+        df[key] = reshaped
+
+    # Re-adding the reshaped columns can change the column order, so we have to change it back
+    if reshaped_columns:
+        # Filter to remove __index__ columns
+        columns = [c for c in array.dtype.names if c in df.columns]
+        assert len(columns) == len(df.columns), (columns, df.columns)
+        df = df.reindex_axis(columns, axis=1, copy=False)
+
     return df
 
 
diff --git a/tests/test.py b/tests/test.py
@@ -191,10 +191,17 @@ def test_flatten():
     os.remove('tmp.root')
 
 
-def test_drop_nonscalar_columns():
-    array = np.array([1, 2, 3])
-    matrix = np.array([[1, 2, 3], [4, 5, 6]])
-    bool_matrix = np.array([[True, False, True], [True, True, True]])
+def to_object_array(array):
+    new_array = np.zeros(len(array), dtype='O')
+    for i, row in enumerate(array):
+        new_array[i] = row
+    return new_array
+
+
+def test_nonscalar_columns():
+    array = np.array([1, 2, 3], dtype=np.int64)
+    matrix = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
+    bool_matrix = np.array([[True, False, True], [True, True, True]], dtype=np.bool_)
 
     dt = np.dtype([
         ('a', 'i4'),
@@ -208,18 +215,17 @@ def test_drop_nonscalar_columns():
         (2, array, matrix, False, bool_matrix)],
         dtype=dt)
 
+    reference_df = pd.DataFrame()
+    reference_df['a'] = np.array([3, 2], dtype=np.int32)
+    reference_df['b'] = to_object_array([array, array])
+    reference_df['c'] = to_object_array([matrix, matrix])
+    reference_df['d'] = np.array([True, False], dtype=np.bool_)
+    reference_df['e'] = to_object_array([bool_matrix, bool_matrix])
+
     path = 'tmp.root'
     array2root(arr, path, 'ntuple', mode='recreate')
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        df = read_root(path, flatten=False)
-        # the above line throws an error if flatten=True because nonscalar columns
-        # are dropped only after the flattening is applied. However, the flattening
-        # algorithm can not deal with arrays of more than one dimension.
-    assert(len(df.columns) == 2)
-    assert(np.all(df.index.values == np.array([0, 1])))
-    assert(np.all(df.a.values == np.array([3, 2])))
-    assert(np.all(df.d.values == np.array([True, False])))
+    df = read_root(path, flatten=False)
+    assert_frame_equal(df, reference_df)
 
     os.remove(path)