
Commit bf191b5

Merge pull request #1847 from cloudsufi/patch/update-poi-api-excel-input
[PLUGIN-1771] Add Streaming support for excel source
2 parents: badc4b0 + 0d12023

2 files changed: +65 −11 lines

core-plugins/pom.xml

+22 −2

@@ -173,15 +173,35 @@
       <version>${hadoop.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>2.15.0</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-compress</artifactId>
+      <version>1.26.0</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi</artifactId>
-      <version>3.12</version>
+      <version>5.2.4</version>
     </dependency>
     <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi-ooxml</artifactId>
-      <version>3.11</version>
+      <version>5.2.4</version>
+    </dependency>
+    <dependency>
+      <groupId>com.github.pjfanning</groupId>
+      <artifactId>excel-streaming-reader</artifactId>
+      <version>4.2.1</version>
+    </dependency>
+    <dependency>
+      <groupId>com.github.pjfanning</groupId>
+      <artifactId>poi-shared-strings</artifactId>
+      <version>2.8.0</version>
     </dependency>
     <dependency>
       <scope>test</scope>
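
The dependency changes above move Apache POI from the 3.x line to 5.2.4 and add pjfanning's excel-streaming-reader (plus the commons-io and commons-compress versions it works with). Below is a minimal, self-contained sketch of the streaming read path these libraries provide; the file name, row-cache size, and buffer size are illustrative placeholders, not values taken from the plugin.

// Minimal sketch (not part of this commit): stream a large .xlsx without loading it fully into memory.
// "large-data.xlsx", rowCacheSize(10) and bufferSize(4096) are illustrative placeholders.
import com.github.pjfanning.xlsx.StreamingReader;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;

import java.io.FileInputStream;
import java.io.InputStream;

public class StreamingReadSketch {
  public static void main(String[] args) throws Exception {
    DataFormatter formatter = new DataFormatter();
    try (InputStream in = new FileInputStream("large-data.xlsx");
         Workbook workbook = StreamingReader.builder()
             .rowCacheSize(10)   // rows kept in memory at a time
             .bufferSize(4096)   // bytes used to buffer the stream
             .open(in)) {
      Sheet sheet = workbook.getSheetAt(0);
      for (Row row : sheet) {          // forward-only iteration; no random access to rows
        StringBuilder line = new StringBuilder();
        for (Cell cell : row) {
          line.append(formatter.formatCellValue(cell)).append('\t');
        }
        System.out.println(line);
      }
    }
  }
}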

core-plugins/src/main/java/io/cdap/plugin/batch/source/ExcelInputFormat.java

+43 −9

@@ -16,6 +16,7 @@
 
 package io.cdap.plugin.batch.source;
 
+import com.github.pjfanning.xlsx.StreamingReader;
 import com.google.common.base.Preconditions;
 import com.google.common.base.Strings;
 import org.apache.hadoop.conf.Configuration;
@@ -26,19 +27,23 @@
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.hadoop.mapreduce.RecordReader;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-import org.apache.poi.hssf.usermodel.HSSFDateUtil;
+import org.apache.poi.EmptyFileException;
+import org.apache.poi.poifs.filesystem.FileMagic;
 import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.DateUtil;
 import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.ss.usermodel.Sheet;
 import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.ss.usermodel.WorkbookFactory;
 import org.apache.poi.ss.util.CellReference;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.Iterator;
 
 
@@ -67,6 +72,11 @@ public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, Tas
     return new ExcelRecordReader();
   }
 
+  @Override
+  public boolean isSplitable(JobContext context, Path file) {
+    return false;
+  }
+
   public static void setConfigurations(Job job, String filePattern, String sheet, boolean reprocess,
                                        String sheetValue, String columnList, boolean skipFirstRow,
                                        String terminateIfEmptyRow, String rowLimit, String ifErrorRecord,
@@ -145,9 +155,31 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro
      String sheet = job.get(SHEET);
      String sheetValue = job.get(SHEET_VALUE);
 
-      Sheet workSheet; // sheet can be used as common for XSSF and HSSF workbook
+      Sheet workSheet;
+      Workbook workbook;
+      boolean isStreaming = false;
      try {
-        Workbook workbook = WorkbookFactory.create(fileIn);
+        // Use Magic Bytes to detect the file type
+        InputStream is = FileMagic.prepareToCheckMagic(fileIn);
+        byte[] emptyFileCheck = new byte[1];
+        is.mark(emptyFileCheck.length);
+        if (is.read(emptyFileCheck) < emptyFileCheck.length) {
+          throw new EmptyFileException();
+        }
+        is.reset();
+
+        final FileMagic fm = FileMagic.valueOf(is);
+        switch (fm) {
+          case OOXML:
+            workbook = StreamingReader.builder().rowCacheSize(10).open(is);
+            isStreaming = true;
+            break;
+          case OLE2:
+            workbook = WorkbookFactory.create(is);
+            break;
+          default:
+            throw new IOException("Can't open workbook - unsupported file type: " + fm);
+        }
        if (sheet.equalsIgnoreCase(SHEET_NAME)) {
          workSheet = workbook.getSheet(sheetValue);
        } else {
@@ -157,7 +189,9 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro
        throw new IllegalArgumentException("Exception while reading excel sheet. " + e.getMessage(), e);
      }
 
-      rowCount = job.getInt(ROWS_LIMIT, workSheet.getPhysicalNumberOfRows());
+      // As we cannot get the number of rows in a sheet while streaming,
+      // -1 is used as rowCount to indicate that all rows should be read.
+      rowCount = job.getInt(ROWS_LIMIT, isStreaming ? -1 : workSheet.getPhysicalNumberOfRows());
      rows = workSheet.iterator();
      lastRowNum = workSheet.getLastRowNum();
      rowIdx = 0;
@@ -171,7 +205,7 @@ public void initialize(InputSplit genericSplit, TaskAttemptContext context) thro
    }
 
    @Override
-    public boolean nextKeyValue() throws IOException, InterruptedException {
+    public boolean nextKeyValue() {
      if (!rows.hasNext() || rowCount == 0) {
        return false;
      }
@@ -200,18 +234,18 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
        Cell cell = cellIterator.next();
        String colName = CellReference.convertNumToColString(cell.getColumnIndex());
        switch (cell.getCellType()) {
-          case Cell.CELL_TYPE_STRING:
+          case STRING:
            sb.append(colName)
              .append(COLUMN_SEPERATOR).append(cell.getStringCellValue()).append(CELL_SEPERATOR);
            break;
 
-          case Cell.CELL_TYPE_BOOLEAN:
+          case BOOLEAN:
            sb.append(colName)
              .append(COLUMN_SEPERATOR).append(cell.getBooleanCellValue()).append(CELL_SEPERATOR);
            break;
 
-          case Cell.CELL_TYPE_NUMERIC:
-            if (HSSFDateUtil.isCellDateFormatted(cell)) {
+          case NUMERIC:
+            if (DateUtil.isCellDateFormatted(cell)) {
              sb.append(colName).append(COLUMN_SEPERATOR).append(cell.getDateCellValue()).append(CELL_SEPERATOR);
            } else {
              sb.append(colName)