Skip to content

Commit dfdb530

Browse files
dselman authored and jeromesimeon committed
feat(pdf-import) : improve quality of import, remove pdf2json
Signed-off-by: Dan Selman <[email protected]>
1 parent da2f5ce commit dfdb530

File tree

8 files changed

+36924
-17015
lines changed

8 files changed

+36924
-17015
lines changed

package-lock.json

+16,685-5,851
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/markdown-pdf/.eslintignore

+1
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ test/data
77
src/grammars
88
.travis
99
scripts
10+
src/domstubs.js

packages/markdown-pdf/jest.config.js

+4-3
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,10 @@ module.exports = {
183183
// transform: null,
184184

185185
// An array of regexp pattern strings that are matched against all source file paths, matched files will skip transformation
186-
// transformIgnorePatterns: [
187-
// "/node_modules/"
188-
// ],
186+
// transformIgnorePatterns : [
187+
// 'src/pdf.js',
188+
// 'src/pdf.worker.js'
189+
// ]
189190

190191
// An array of regexp pattern strings that are matched against all modules before the module loader will automatically return a mock for them
191192
// unmockedModulePathPatterns: undefined,

packages/markdown-pdf/package.json

+3-2
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@
8282
"dependencies": {
8383
"@accordproject/markdown-cicero": "0.12.7",
8484
"@accordproject/markdown-common": "0.12.7",
85-
"pdf2json": "1.2.0",
85+
"pdfjs-dist": "^2.4.456",
8686
"type-of": "^2.0.1",
8787
"pdfmake": "0.1.66"
8888
},
@@ -97,7 +97,8 @@
9797
"!./out/**/*",
9898
"!./lib/**/*",
9999
"!./umd/**/*",
100-
"!./bin/index.js"
100+
"!./bin/index.js",
101+
"!./src/domstubs.js"
101102
],
102103
"path": "header.txt",
103104
"blocking": true,

packages/markdown-pdf/src/PdfTransformer.js

+90-143
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,11 @@
1414

1515
'use strict';
1616

17-
const PDFParser = require('pdf2json');
17+
// HACK few hacks to let PDF.js be loaded not as a module in global space.
18+
require('./domstubs.js').setStubs(global);
19+
20+
let pdfjsLib = require('pdfjs-dist/es5/build/pdf.js');
21+
1822
const CiceroMarkTransformer = require('@accordproject/markdown-cicero').CiceroMarkTransformer;
1923
const PdfPrinter = require('pdfmake');
2024
const ToPdfMakeVisitor = require('./ToPdfMakeVisitor');
@@ -83,87 +87,81 @@ class PdfTransformer {
8387
* @param {string} [format] result format, defaults to 'concerto'. Pass
8488
* 'json' to return the JSON data.
8589
* @param {object} [options] - the PDF parsing options
86-
* @param {number} [options.paragraphVerticalOffset] - the vertical offset used to detect paragraphs (defaults to 1)
87-
* @param {boolean} [options.preservePages] - whether to preserve page breaks (defaults to true)
90+
* @param {number} [options.paragraphVerticalOffset] - the vertical offset used to detect
91+
* paragraphs as a multiple of the line height (defaults to 2)
92+
* @param {boolean} [options.preservePages] - whether to preserve PDF page breaks (defaults to true)
8893
* @returns {promise} a Promise to the CiceroMark DOM
8994
*/
90-
async toCiceroMark(input, format = 'concerto', options = { paragraphVerticalOffset: 1, preservePages: true }) {
91-
return new Promise( (resolve, reject) => {
92-
const pdfParser = new PDFParser(null, false);
93-
const errorCallback = (errData) => reject(`PDF parsing failed with error ${errData.parserError}`);
94-
const conversionCallback = (pdfData) => {
95-
96-
const document = {
97-
$class : 'org.accordproject.commonmark.Document',
98-
xmlns : pdfData.formImage.Id.Name,
99-
nodes : []
95+
async toCiceroMark(input, format = 'concerto', options = { paragraphVerticalOffset: 2, preservePages: true }) {
96+
97+
let loadingTask = pdfjsLib.getDocument(input.buffer);
98+
99+
const doc = await loadingTask.promise;
100+
let numPages = doc.numPages;
101+
const metadata = await doc.getMetadata();
102+
103+
const pages = [];
104+
for( let n=1; n <= numPages; n++) {
105+
const page = await doc.getPage(n);
106+
const content = await page.getTextContent({
107+
normalizeWhitespace: true,
108+
disableCombineTextItems: true,
109+
});
110+
111+
let currentPara = null;
112+
let lastY = 0;
113+
const result = {
114+
nodes: []
115+
};
116+
117+
content.items.forEach( text => {
118+
const tx = text.transform;
119+
const textY = tx[5];
120+
const height = text.height;
121+
const newPara = Math.abs(lastY - textY) > (height * options.paragraphVerticalOffset);
122+
123+
if(!currentPara || newPara) {
124+
currentPara = {
125+
$class : 'org.accordproject.commonmark.Paragraph',
126+
nodes : []
127+
};
128+
result.nodes.push(currentPara);
129+
}
130+
131+
const textNode = {
132+
$class : 'org.accordproject.commonmark.Text',
133+
text : text.str.replace(/(?:\r\n|\r|\n)/g, ' ')
100134
};
101135

102-
// pdfData = pdfParser.getMergedTextBlocksIfNeeded();
103-
104-
let currentPara = null;
105-
pdfData.formImage.Pages.forEach(page => {
106-
let lastY = 0;
107-
page.Texts.forEach( text => {
108-
if(!currentPara || Math.abs(lastY - text.y) > options.paragraphVerticalOffset) {
109-
currentPara = {
110-
$class : 'org.accordproject.commonmark.Paragraph',
111-
nodes : []
112-
};
113-
document.nodes.push(currentPara);
114-
}
115-
116-
text.R.forEach( run => {
117-
let [/*fontFaceId*/, /*fontSize*/, bold, italic] = run.TS;
118-
const textNode = {
119-
$class : 'org.accordproject.commonmark.Text',
120-
text : run.T ? decodeURIComponent(run.T) : ''
121-
};
122-
if(bold && !italic) {
123-
const bold = {
124-
$class : 'org.accordproject.commonmark.Strong',
125-
nodes : [textNode]
126-
};
127-
PdfTransformer.pushNode(currentPara, bold, lastY, text.y);
128-
}
129-
else if(italic && !bold) {
130-
const italic = {
131-
$class : 'org.accordproject.commonmark.Emph',
132-
nodes : [textNode]
133-
};
134-
PdfTransformer.pushNode(currentPara, italic, lastY, text.y);
135-
}
136-
else if(italic && bold) {
137-
const boldItalic = {
138-
$class : 'org.accordproject.commonmark.Strong',
139-
nodes : [{
140-
$class : 'org.accordproject.commonmark.Emph',
141-
nodes : [textNode]
142-
}]
143-
};
144-
PdfTransformer.pushNode(currentPara, boldItalic, lastY, text.y);
145-
}
146-
else {
147-
PdfTransformer.pushNode(currentPara, textNode, lastY, text.y);
148-
}
149-
});
150-
lastY = text.y;
151-
});
136+
currentPara.nodes.push(textNode);
152137

153-
if(options.preservePages) {
154-
document.nodes.push( {
155-
$class : 'org.accordproject.commonmark.ThematicBreak'
156-
});
157-
}
138+
if(text.str.trim().length > 0) {
139+
lastY = textY;
140+
}
141+
});
142+
143+
if(options.preservePages) {
144+
result.nodes.push( {
145+
$class : 'org.accordproject.commonmark.ThematicBreak'
158146
});
159-
resolve(document);
160-
};
147+
}
161148

162-
// trigger parsing
163-
pdfParser.on('pdfParser_dataError', errorCallback);
164-
pdfParser.on('pdfParser_dataReady', conversionCallback);
165-
pdfParser.parseBuffer(input);
149+
pages.push(result);
150+
}
151+
152+
let merged = [];
153+
154+
pages.forEach( page => {
155+
merged = merged.concat(page.nodes);
166156
});
157+
158+
const document = {
159+
$class : 'org.accordproject.commonmark.Document',
160+
xmlns : metadata.Title ? metadata.Title : 'Unknown',
161+
nodes : merged
162+
};
163+
164+
return document;
167165
}
168166

169167
/**
@@ -198,7 +196,8 @@ class PdfTransformer {
198196

199197
dd.pageSize = 'LETTER';
200198
dd.pageOrientation = 'portrait',
201-
dd.pageMargins = [ 80, 80, 80, 80 ];
199+
// left, top, right, bottom
200+
dd.pageMargins = [ 81, 72, 81, 72 ]; // units are points (72 per inch)
202201

203202
// allow overriding top-level options
204203
Object.assign(dd, options);
@@ -235,39 +234,45 @@ class PdfTransformer {
235234
const defaultStyles = {
236235
Footer: {
237236
alignment: 'left',
238-
margin : [10, 10, 0, 0]
237+
fontSize: 10,
238+
// left, top, right, bottom
239+
margin : [81, 36, 0, 0]
239240
},
240241
PageNumber: {
241242
alignment: 'center',
242-
margin : [0, 0, 0, 0]
243+
fontSize: 10,
244+
// left, top, right, bottom
245+
margin : [0, -11, 0, 0]
243246
},
244247
Header: {
245248
alignment: 'right',
246-
margin : [0, 10, 10, 0]
249+
fontSize: 10,
250+
// left, top, right, bottom
251+
margin : [0, 36, 81, 0]
247252
},
248253
heading_one: {
249-
fontSize: 30,
254+
fontSize: 25,
250255
bold: true,
251256
alignment: 'center'
252257
},
253258
heading_two: {
254-
fontSize: 28,
259+
fontSize: 20,
255260
bold: true
256261
},
257262
heading_three: {
258-
fontSize: 26,
263+
fontSize: 16,
259264
bold: true
260265
},
261266
heading_four: {
262-
fontSize: 24,
267+
fontSize: 15,
263268
bold: true
264269
},
265270
heading_five: {
266-
fontSize: 22,
271+
fontSize: 14,
267272
bold: true
268273
},
269274
heading_six: {
270-
fontSize: 20,
275+
fontSize: 13,
271276
bold: true
272277
},
273278
Code: {
@@ -286,7 +291,7 @@ class PdfTransformer {
286291
alignment: 'justify'
287292
},
288293
toc: {
289-
fontSize: 30,
294+
fontSize: 25,
290295
bold: true,
291296
alignment: 'center'
292297
},
@@ -308,64 +313,6 @@ class PdfTransformer {
308313
pdfDoc.pipe(outputStream);
309314
pdfDoc.end();
310315
}
311-
312-
/**
313-
* Utility to get the last child of a node.
314-
* @param {object} node a commonmark node
315-
* @returns {object} the last child node, or null
316-
*/
317-
static getLastChildNode(node) {
318-
return node.nodes.length > 0 ? node.nodes[node.nodes.length-1] : null;
319-
}
320-
321-
/**
322-
* Utility to merge text nodes. It recurses so that it can deal with
323-
* bold, italic, bold+italic text.
324-
* @param {object} srcNode a commonmark node
325-
* @param {object} destNode a commonmark node
326-
* @returns {object} the modified destination node, or null
327-
*/
328-
static mergeTextNode(srcNode, destNode) {
329-
if(srcNode && destNode ) {
330-
if( srcNode.$class === destNode.$class ) {
331-
if(srcNode.$class === 'org.accordproject.commonmark.Text') {
332-
destNode.text = destNode.text + srcNode.text;
333-
return destNode;
334-
}
335-
else {
336-
const srcChild = PdfTransformer.getLastChildNode(srcNode);
337-
const destChild = PdfTransformer.getLastChildNode(destNode);
338-
return PdfTransformer.mergeTextNode(srcChild, destChild);
339-
}
340-
}
341-
}
342-
343-
return null;
344-
}
345-
346-
/**
347-
* Utility to merge adjacent text runs from a PDF
348-
* @param {*} currentPara CommonMark paragraph node
349-
* @param {*} node the current node
350-
* @param {*} lastY the last Y offset position from PDF
351-
* @param {*} textY the current Y offset position from PDF
352-
*/
353-
static pushNode(currentPara, node, lastY, textY) {
354-
if(lastY !== textY) {
355-
currentPara.nodes.push( {
356-
$class : 'org.accordproject.commonmark.Softbreak'
357-
});
358-
currentPara.nodes.push(node);
359-
}
360-
else {
361-
const lastNode = PdfTransformer.getLastChildNode(currentPara);
362-
const merged = PdfTransformer.mergeTextNode(node, lastNode);
363-
364-
if(!merged) {
365-
currentPara.nodes.push(node);
366-
}
367-
}
368-
}
369316
}
370317

371318
module.exports = PdfTransformer;

packages/markdown-pdf/src/PdfTransformer.test.js

+13-1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
'use strict';
1818

1919
const fs = require('fs');
20+
const path = require('path');
21+
2022
const PdfTransformer = require('./PdfTransformer');
2123
const CiceroMarkTransformer = require('@accordproject/markdown-cicero').CiceroMarkTransformer;
2224

@@ -134,11 +136,21 @@ describe('pdf import', () => {
134136
// console.log(JSON.stringify(ciceroMarkDom, null, 4));
135137
// }
136138
expect(ciceroMarkDom).toMatchSnapshot(); // (1)
137-
return saveCiceroMarkAsPdf(ciceroMarkDom, file + '-import'); // roundtrip for debug
139+
return saveCiceroMarkAsPdf(ciceroMarkDom, file + '-roundtrip'); // roundtrip for debug
138140
});
139141
});
140142
});
141143

144+
describe('pdf import 2', () => {
145+
it('converts Land_Sale_Contract to cicero mark', async () => {
146+
const pdfContent = fs.readFileSync( path.join(__dirname, '/../test/data', 'Land_Sale_Contract.pdf'), null );
147+
const ciceroMarkDom = await pdfTransformer.toCiceroMark(pdfContent, 'json');
148+
// console.log(JSON.stringify(ciceroMarkDom, null, 4));
149+
expect(ciceroMarkDom).toMatchSnapshot(); // (1)
150+
return saveCiceroMarkAsPdf(ciceroMarkDom, 'Land_Sale_Contract-debug'); // roundtrip for debug
151+
});
152+
});
153+
142154
describe('pdf generation', () => {
143155
getMarkdownFiles().forEach(([file, markdownContent], i) => {
144156
it(`converts ${file} to pdf`, async () => {

0 commit comments

Comments
 (0)