Skip to content
This repository was archived by the owner on Jan 9, 2023. It is now read-only.

Commit 4ab672f

Browse files
authored
Merge pull request #54 from chrisburr/develop
Add support for columns of arrays and general tidying
2 parents 1dc249c + 354bc15 commit 4ab672f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+215
-84
lines changed

.travis.yml

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,27 @@
1-
#sudo: false
2-
# travis-ci.org build & test configuration
31
language: python
42

53
matrix:
6-
include:
7-
- python: 2.7
8-
env: PYTHON=2.7 ROOT=5.34.32
9-
- python: 2.7
10-
env: PYTHON=2.7 ROOT=6.04
11-
- python: 3.4
12-
env: PYTHON=3.4 ROOT=5.34.32
13-
- python: 3.4
14-
env: PYTHON=3.4 ROOT=6.04
15-
- python: 3.5
16-
env: PYTHON=3.4 ROOT=5.34.32
17-
- python: 3.5
18-
env: PYTHON=3.4 ROOT=6.04
19-
- python: 3.6
20-
env: PYTHON=3.4 ROOT=5.34.32
21-
- python: 3.6
22-
env: PYTHON=3.4 ROOT=6.04
23-
#install: source ci/install.sh
24-
install:
25-
- if [ "${TRAVIS_OS_NAME}" == "osx" ]; then curl --silent http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -o miniconda.sh; fi
26-
- if [ "${TRAVIS_OS_NAME}" == "linux" ]; then wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; fi
4+
include:
5+
- python: 2.7
6+
env: PYTHON=2.7 ROOT=5.34.32
7+
- python: 2.7
8+
env: PYTHON=2.7 ROOT=6.04
9+
- python: 3.4
10+
env: PYTHON=3.4 ROOT=5.34.32
11+
- python: 3.4
12+
env: PYTHON=3.4 ROOT=6.04
2713

14+
install:
15+
- if [ "${TRAVIS_OS_NAME}" == "osx" ]; then curl --silent http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh
16+
-o miniconda.sh; fi
17+
- if [ "${TRAVIS_OS_NAME}" == "linux" ]; then wget -nv http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
18+
-O miniconda.sh; fi
2819
- bash miniconda.sh -b -p $HOME/miniconda
2920
- export PATH="$HOME/miniconda/bin:$PATH"
3021
- hash -r
3122
- conda config --set always_yes yes --set changeps1 no
3223
- conda update -q conda
33-
- conda info -a # Useful for debugging any issues with conda
24+
- conda info -a
3425
- conda config --add channels http://conda.anaconda.org/NLeSC
3526
- conda config --set show_channel_urls yes
3627
- conda create -q -n testenv python=${PYTHON} root=${ROOT} rootpy pandas nose
@@ -41,7 +32,16 @@ install:
4132
script: nosetests --with-coverage --cover-package=root_pandas
4233

4334
after_success:
44-
- time coveralls
35+
- time coveralls
4536

4637
notifications:
47-
email: false
38+
email: false
39+
40+
deploy:
41+
provider: pypi
42+
user: chrisburr
43+
password:
44+
secure: MyD2Q4zASzpXWaOBnbkGGm7luYB2SrrBVdX4faN0JmSmDcssn/exu2XDAIwhbZhg3uZC4bq7mBUpPiw/3Mx1f5kFgWlnjpnSRDaGhGLLc6rBp9Kqt6IOWcQ64yQ+S6LIuJ+tjbTMJAlNZgy3HDEwBWXKBvectWKJPZdVCenfMPA=
45+
on:
46+
tags: true
47+
branch: master

root_pandas/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,7 @@
11
from .readwrite import read_root
22
from .readwrite import to_root
3+
4+
__all__ = [
5+
'read_root',
6+
'to_root',
7+
]

root_pandas/readwrite.py

Lines changed: 52 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929

3030
def expand_braces(orig):
31-
r = r'.*(\{.+?[^\\]\})'
31+
r = r'.*?(\{.+[^\\]\})'
3232
p = re.compile(r)
3333

3434
s = orig[:]
@@ -40,12 +40,10 @@ def expand_braces(orig):
4040
open_brace = s.find(sub)
4141
close_brace = open_brace + len(sub) - 1
4242
if sub.find(',') != -1:
43-
for pat in sub.strip('{}').split(','):
43+
for pat in sub[1:-1].split(','):
4444
res.extend(expand_braces(s[:open_brace] + pat + s[close_brace+1:]))
45-
4645
else:
4746
res.extend(expand_braces(s[:open_brace] + sub.replace('}', '\\}') + s[close_brace+1:]))
48-
4947
else:
5048
res.append(s.replace('\\}', '}'))
5149

@@ -59,6 +57,7 @@ def get_nonscalar_columns(array):
5957
bad_names = col_names[bad_cols]
6058
return list(bad_names)
6159

60+
6261
def get_matching_variables(branches, patterns, fail=True):
6362
selected = []
6463

@@ -93,6 +92,30 @@ def filter_noexpand_columns(columns):
9392
return other, noexpand
9493

9594

95+
def do_flatten(arr, flatten):
96+
if flatten is True:
97+
warnings.warn(" The option flatten=True is deprecated. Please specify the branches you would like "
98+
"to flatten in a list: flatten=['foo', 'bar']", FutureWarning)
99+
arr_, idx = stretch(arr, return_indices=True)
100+
else:
101+
nonscalar = get_nonscalar_columns(arr)
102+
fields = [x for x in arr.dtype.names if (x not in nonscalar or x in flatten)]
103+
104+
for col in flatten:
105+
if col in nonscalar:
106+
pass
107+
elif col in fields:
108+
raise ValueError("Requested to flatten {col} but it has a scalar type"
109+
.format(col=col))
110+
else:
111+
raise ValueError("Requested to flatten {col} but it wasn't loaded from the input file"
112+
.format(col=col))
113+
114+
arr_, idx = stretch(arr, fields=fields, return_indices=True)
115+
arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
116+
return arr
117+
118+
96119
def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
97120
"""
98121
Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
@@ -175,22 +198,6 @@ def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=
175198
for var in ignored:
176199
all_vars.remove(var)
177200

178-
def do_flatten(arr, flatten):
179-
if flatten is True:
180-
warnings.warn(" The option flatten=True is deprecated. Please specify the branches you would like "
181-
"to flatten in a list: flatten=['foo', 'bar']", FutureWarning)
182-
arr_, idx = stretch(arr, return_indices=True)
183-
else:
184-
nonscalar = get_nonscalar_columns(arr)
185-
fields = [x for x in arr.dtype.names if (x not in nonscalar or x in flatten)]
186-
will_drop = [x for x in arr.dtype.names if x not in fields]
187-
if will_drop:
188-
warnings.warn("Ignored the following non-scalar branches: {bad_names}"
189-
.format(bad_names=", ".join(will_drop)), UserWarning)
190-
arr_, idx = stretch(arr, fields=fields, return_indices=True)
191-
arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
192-
return arr
193-
194201
if chunksize:
195202
tchain = ROOT.TChain(key)
196203
for path in paths:
@@ -216,26 +223,45 @@ def genchunks():
216223

217224
def convert_to_dataframe(array, start_index=None):
218225
nonscalar_columns = get_nonscalar_columns(array)
219-
if nonscalar_columns:
220-
warnings.warn("Ignored the following non-scalar branches: {bad_names}"
221-
.format(bad_names=", ".join(nonscalar_columns)), UserWarning)
222-
indices = list(filter(lambda x: x.startswith('__index__') and x not in nonscalar_columns, array.dtype.names))
226+
227+
# Columns containing 2D arrays can't be loaded, so convert them to 1D arrays of arrays
228+
reshaped_columns = {}
229+
for col in nonscalar_columns:
230+
if array[col].ndim >= 2:
231+
reshaped = np.zeros(len(array[col]), dtype='O')
232+
for i, row in enumerate(array[col]):
233+
reshaped[i] = row
234+
reshaped_columns[col] = reshaped
235+
236+
indices = list(filter(lambda x: x.startswith('__index__'), array.dtype.names))
223237
if len(indices) == 0:
224238
index = None
225239
if start_index is not None:
226240
index = RangeIndex(start=start_index, stop=start_index + len(array))
227-
df = DataFrame.from_records(array, exclude=nonscalar_columns, index=index)
241+
df = DataFrame.from_records(array, exclude=reshaped_columns, index=index)
228242
elif len(indices) == 1:
229243
# We store the index under the __index__* branch, where
230244
# * is the name of the index
231-
df = DataFrame.from_records(array, index=indices[0], exclude=nonscalar_columns)
245+
df = DataFrame.from_records(array, exclude=reshaped_columns, index=indices[0])
232246
index_name = indices[0][len('__index__'):]
233247
if not index_name:
234248
# None means the index has no name
235249
index_name = None
236250
df.index.name = index_name
237251
else:
238252
raise ValueError("More than one index found in file")
253+
254+
# Manually add the columns which were reshaped
255+
for key, reshaped in reshaped_columns.items():
256+
df[key] = reshaped
257+
258+
# Reshaping can cause the order of columns to change so we have to change it back
259+
if reshaped_columns:
260+
# Filter to remove __index__ columns
261+
columns = [c for c in array.dtype.names if c in df.columns]
262+
assert len(columns) == len(df.columns), (columns, df.columns)
263+
df = df.reindex_axis(columns, axis=1, copy=False)
264+
239265
return df
240266

241267

root_pandas/utils.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# Copyright (c) 2012 rootpy developers and contributors
2-
#
2+
#
33
# Permission is hereby granted, free of charge, to any person obtaining a copy of
44
# this software and associated documentation files (the "Software"), to deal in
55
# the Software without restriction, including without limitation the rights to
66
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
77
# the Software, and to permit persons to whom the Software is furnished to do so,
88
# subject to the following conditions:
9-
#
9+
#
1010
# The above copyright notice and this permission notice shall be included in all
1111
# copies or substantial portions of the Software.
12-
#
12+
#
1313
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1414
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
1515
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
@@ -24,6 +24,7 @@
2424
import numpy as np
2525
VLEN = np.vectorize(len)
2626

27+
2728
def stretch(arr, fields=None, return_indices=False):
2829
"""Stretch an array.
2930
Stretch an array by ``hstack()``-ing multiple array fields while
@@ -104,5 +105,5 @@ def stretch(arr, fields=None, return_indices=False):
104105
if return_indices:
105106
idx = np.concatenate(list(map(np.arange, len_array)))
106107
return ret, idx
107-
108+
108109
return ret

tests/samples/HZZ-lz4.root

280 KB
Binary file not shown.

tests/samples/HZZ-lzma.root

180 KB
Binary file not shown.

tests/samples/HZZ-uncompressed.root

666 KB
Binary file not shown.

tests/samples/HZZ-zlib.root

217 KB
Binary file not shown.

tests/samples/HZZ.root

213 KB
Binary file not shown.

tests/samples/README.md

Lines changed: 3 additions & 0 deletions

0 commit comments

Comments
 (0)