14
14
15
15
'use strict' ;
16
16
17
- const PDFParser = require ( 'pdf2json' ) ;
17
+ // HACK: a few workarounds so that PDF.js can be loaded into the global space rather than as a module.
18
+ require ( './domstubs.js' ) . setStubs ( global ) ;
19
+
20
+ let pdfjsLib = require ( 'pdfjs-dist/es5/build/pdf.js' ) ;
21
+
18
22
const CiceroMarkTransformer = require ( '@accordproject/markdown-cicero' ) . CiceroMarkTransformer ;
19
23
const PdfPrinter = require ( 'pdfmake' ) ;
20
24
const ToPdfMakeVisitor = require ( './ToPdfMakeVisitor' ) ;
@@ -83,87 +87,81 @@ class PdfTransformer {
83
87
* @param {string } [format] result format, defaults to 'concerto'. Pass
84
88
* 'json' to return the JSON data.
85
89
* @param {object } [options] - the PDF parsing options
86
- * @param {number } [options.paragraphVerticalOffset] - the vertical offset used to detect pararaphs (defaults to 1)
87
- * @param {boolean } [options.preservePages] - whether to preserve page breaks (defaults to true)
90
+ * @param {number } [options.paragraphVerticalOffset] - the vertical offset used to detect
91
+ * paragraphs as a multiple of the line height (defaults to 2)
92
+ * @param {boolean } [options.preservePages] - whether to preserve PDF page breaks (defaults to true)
88
93
* @returns {promise } a Promise to the CiceroMark DOM
89
94
*/
90
- async toCiceroMark ( input , format = 'concerto' , options = { paragraphVerticalOffset : 1 , preservePages : true } ) {
91
- return new Promise ( ( resolve , reject ) => {
92
- const pdfParser = new PDFParser ( null , false ) ;
93
- const errorCallback = ( errData ) => reject ( `PDF parsing failed with error ${ errData . parserError } ` ) ;
94
- const conversionCallback = ( pdfData ) => {
95
-
96
- const document = {
97
- $class : 'org.accordproject.commonmark.Document' ,
98
- xmlns : pdfData . formImage . Id . Name ,
99
- nodes : [ ]
95
+ async toCiceroMark ( input , format = 'concerto' , options = { paragraphVerticalOffset : 2 , preservePages : true } ) {
96
+
97
+ let loadingTask = pdfjsLib . getDocument ( input . buffer ) ;
98
+
99
+ const doc = await loadingTask . promise ;
100
+ let numPages = doc . numPages ;
101
+ const metadata = await doc . getMetadata ( ) ;
102
+
103
+ const pages = [ ] ;
104
+ for ( let n = 1 ; n <= numPages ; n ++ ) {
105
+ const page = await doc . getPage ( n ) ;
106
+ const content = await page . getTextContent ( {
107
+ normalizeWhitespace : true ,
108
+ disableCombineTextItems : true ,
109
+ } ) ;
110
+
111
+ let currentPara = null ;
112
+ let lastY = 0 ;
113
+ const result = {
114
+ nodes : [ ]
115
+ } ;
116
+
117
+ content . items . forEach ( text => {
118
+ const tx = text . transform ;
119
+ const textY = tx [ 5 ] ;
120
+ const height = text . height ;
121
+ const newPara = Math . abs ( lastY - textY ) > ( height * options . paragraphVerticalOffset ) ;
122
+
123
+ if ( ! currentPara || newPara ) {
124
+ currentPara = {
125
+ $class : 'org.accordproject.commonmark.Paragraph' ,
126
+ nodes : [ ]
127
+ } ;
128
+ result . nodes . push ( currentPara ) ;
129
+ }
130
+
131
+ const textNode = {
132
+ $class : 'org.accordproject.commonmark.Text' ,
133
+ text : text . str . replace ( / (?: \r \n | \r | \n ) / g, ' ' )
100
134
} ;
101
135
102
- // pdfData = pdfParser.getMergedTextBlocksIfNeeded();
103
-
104
- let currentPara = null ;
105
- pdfData . formImage . Pages . forEach ( page => {
106
- let lastY = 0 ;
107
- page . Texts . forEach ( text => {
108
- if ( ! currentPara || Math . abs ( lastY - text . y ) > options . paragraphVerticalOffset ) {
109
- currentPara = {
110
- $class : 'org.accordproject.commonmark.Paragraph' ,
111
- nodes : [ ]
112
- } ;
113
- document . nodes . push ( currentPara ) ;
114
- }
115
-
116
- text . R . forEach ( run => {
117
- let [ /*fontFaceId*/ , /*fontSize*/ , bold , italic ] = run . TS ;
118
- const textNode = {
119
- $class : 'org.accordproject.commonmark.Text' ,
120
- text : run . T ? decodeURIComponent ( run . T ) : ''
121
- } ;
122
- if ( bold && ! italic ) {
123
- const bold = {
124
- $class : 'org.accordproject.commonmark.Strong' ,
125
- nodes : [ textNode ]
126
- } ;
127
- PdfTransformer . pushNode ( currentPara , bold , lastY , text . y ) ;
128
- }
129
- else if ( italic && ! bold ) {
130
- const italic = {
131
- $class : 'org.accordproject.commonmark.Emph' ,
132
- nodes : [ textNode ]
133
- } ;
134
- PdfTransformer . pushNode ( currentPara , italic , lastY , text . y ) ;
135
- }
136
- else if ( italic && bold ) {
137
- const boldItalic = {
138
- $class : 'org.accordproject.commonmark.Strong' ,
139
- nodes : [ {
140
- $class : 'org.accordproject.commonmark.Emph' ,
141
- nodes : [ textNode ]
142
- } ]
143
- } ;
144
- PdfTransformer . pushNode ( currentPara , boldItalic , lastY , text . y ) ;
145
- }
146
- else {
147
- PdfTransformer . pushNode ( currentPara , textNode , lastY , text . y ) ;
148
- }
149
- } ) ;
150
- lastY = text . y ;
151
- } ) ;
136
+ currentPara . nodes . push ( textNode ) ;
152
137
153
- if ( options . preservePages ) {
154
- document . nodes . push ( {
155
- $class : 'org.accordproject.commonmark.ThematicBreak'
156
- } ) ;
157
- }
138
+ if ( text . str . trim ( ) . length > 0 ) {
139
+ lastY = textY ;
140
+ }
141
+ } ) ;
142
+
143
+ if ( options . preservePages ) {
144
+ result . nodes . push ( {
145
+ $class : 'org.accordproject.commonmark.ThematicBreak'
158
146
} ) ;
159
- resolve ( document ) ;
160
- } ;
147
+ }
161
148
162
- // trigger parsing
163
- pdfParser . on ( 'pdfParser_dataError' , errorCallback ) ;
164
- pdfParser . on ( 'pdfParser_dataReady' , conversionCallback ) ;
165
- pdfParser . parseBuffer ( input ) ;
149
+ pages . push ( result ) ;
150
+ }
151
+
152
+ let merged = [ ] ;
153
+
154
+ pages . forEach ( page => {
155
+ merged = merged . concat ( page . nodes ) ;
166
156
} ) ;
157
+
158
+ const document = {
159
+ $class : 'org.accordproject.commonmark.Document' ,
160
+ xmlns : metadata . Title ? metadata . Title : 'Unknown' ,
161
+ nodes : merged
162
+ } ;
163
+
164
+ return document ;
167
165
}
168
166
169
167
/**
@@ -198,7 +196,8 @@ class PdfTransformer {
198
196
199
197
dd . pageSize = 'LETTER' ;
200
198
dd . pageOrientation = 'portrait' ,
201
- dd . pageMargins = [ 80 , 80 , 80 , 80 ] ;
199
+ // left, top, right, bottom
200
+ dd . pageMargins = [ 81 , 72 , 81 , 72 ] ; // units are points (72 per inch)
202
201
203
202
// allow overriding top-level options
204
203
Object . assign ( dd , options ) ;
@@ -235,39 +234,45 @@ class PdfTransformer {
235
234
const defaultStyles = {
236
235
Footer : {
237
236
alignment : 'left' ,
238
- margin : [ 10 , 10 , 0 , 0 ]
237
+ fontSize : 10 ,
238
+ // left, top, right, bottom
239
+ margin : [ 81 , 36 , 0 , 0 ]
239
240
} ,
240
241
PageNumber : {
241
242
alignment : 'center' ,
242
- margin : [ 0 , 0 , 0 , 0 ]
243
+ fontSize : 10 ,
244
+ // left, top, right, bottom
245
+ margin : [ 0 , - 11 , 0 , 0 ]
243
246
} ,
244
247
Header : {
245
248
alignment : 'right' ,
246
- margin : [ 0 , 10 , 10 , 0 ]
249
+ fontSize : 10 ,
250
+ // left, top, right, bottom
251
+ margin : [ 0 , 36 , 81 , 0 ]
247
252
} ,
248
253
heading_one : {
249
- fontSize : 30 ,
254
+ fontSize : 25 ,
250
255
bold : true ,
251
256
alignment : 'center'
252
257
} ,
253
258
heading_two : {
254
- fontSize : 28 ,
259
+ fontSize : 20 ,
255
260
bold : true
256
261
} ,
257
262
heading_three : {
258
- fontSize : 26 ,
263
+ fontSize : 16 ,
259
264
bold : true
260
265
} ,
261
266
heading_four : {
262
- fontSize : 24 ,
267
+ fontSize : 15 ,
263
268
bold : true
264
269
} ,
265
270
heading_five : {
266
- fontSize : 22 ,
271
+ fontSize : 14 ,
267
272
bold : true
268
273
} ,
269
274
heading_six : {
270
- fontSize : 20 ,
275
+ fontSize : 13 ,
271
276
bold : true
272
277
} ,
273
278
Code : {
@@ -286,7 +291,7 @@ class PdfTransformer {
286
291
alignment : 'justify'
287
292
} ,
288
293
toc : {
289
- fontSize : 30 ,
294
+ fontSize : 25 ,
290
295
bold : true ,
291
296
alignment : 'center'
292
297
} ,
@@ -308,64 +313,6 @@ class PdfTransformer {
308
313
pdfDoc . pipe ( outputStream ) ;
309
314
pdfDoc . end ( ) ;
310
315
}
311
-
312
- /**
313
- * Utility to get the last child of a node.
314
- * @param {object } node a commonmark node
315
- * @returns {object } the last child node, or null
316
- */
317
- static getLastChildNode ( node ) {
318
- return node . nodes . length > 0 ? node . nodes [ node . nodes . length - 1 ] : null ;
319
- }
320
-
321
- /**
322
- * Utility to merge text nodes. It recurses so that is can deal with
323
- * bold, italic, bold+italic text.
324
- * @param {object } srcNode a commonmark node
325
- * @param {object } destNode a commonmark node
326
- * @returns {object } the modified destination node, or null
327
- */
328
- static mergeTextNode ( srcNode , destNode ) {
329
- if ( srcNode && destNode ) {
330
- if ( srcNode . $class === destNode . $class ) {
331
- if ( srcNode . $class === 'org.accordproject.commonmark.Text' ) {
332
- destNode . text = destNode . text + srcNode . text ;
333
- return destNode ;
334
- }
335
- else {
336
- const srcChild = PdfTransformer . getLastChildNode ( srcNode ) ;
337
- const destChild = PdfTransformer . getLastChildNode ( destNode ) ;
338
- return PdfTransformer . mergeTextNode ( srcChild , destChild ) ;
339
- }
340
- }
341
- }
342
-
343
- return null ;
344
- }
345
-
346
- /**
347
- * Utility to merge adjacent text runs from a PDF
348
- * @param {* } currentPara CommonMark paragraph node
349
- * @param {* } node the current node
350
- * @param {* } lastY the last Y offset position from PDF
351
- * @param {* } textY the current Y offset position from PDF
352
- */
353
- static pushNode ( currentPara , node , lastY , textY ) {
354
- if ( lastY !== textY ) {
355
- currentPara . nodes . push ( {
356
- $class : 'org.accordproject.commonmark.Softbreak'
357
- } ) ;
358
- currentPara . nodes . push ( node ) ;
359
- }
360
- else {
361
- const lastNode = PdfTransformer . getLastChildNode ( currentPara ) ;
362
- const merged = PdfTransformer . mergeTextNode ( node , lastNode ) ;
363
-
364
- if ( ! merged ) {
365
- currentPara . nodes . push ( node ) ;
366
- }
367
- }
368
- }
369
316
}
370
317
371
318
module . exports = PdfTransformer ;
0 commit comments