@@ -14,6 +14,10 @@ const warcio = require("warcio");
1414const Redis = require ( "ioredis" ) ;
1515
1616const TextExtract = require ( "./textextract" ) ;
17+
// Source text of the two @mozilla/readability scripts (the "readerable" check
// followed by the parser), read synchronously when this module is loaded and
// concatenated into a single string of JavaScript. The string is interpolated
// into a page.evaluate() call when the readerView option is set.
18+ const readabilityJs = fs . readFileSync ( "/app/node_modules/@mozilla/readability/Readability-readerable.js" , "utf-8" )
19+ + fs . readFileSync ( "/app/node_modules/@mozilla/readability/Readability.js" , "utf-8" ) ;
20+
// Source text of the browsertrix-behaviors bundle, also read synchronously at
// module load time.
1721const behaviors = fs . readFileSync ( "/app/node_modules/browsertrix-behaviors/dist/behaviors.js" , "utf-8" ) ;
1822
// HTML-like content types.
// NOTE(review): where this list is consulted is not visible in this excerpt.
1923const HTML_TYPES = [ "text/html" , "application/xhtml" , "application/xhtml+xml" ] ;
@@ -281,6 +285,12 @@ class Crawler {
281285 default : false ,
282286 } ,
283287
// Boolean option, off by default. When set, the page handler evaluates
// Mozilla Readability inside the loaded page and passes the parsed result to
// writePage(), which stores it under the "article" key of the page row.
288+ "readerView" : {
289+ describe : "If set, apply Mozilla's reader view and add the 'article' object to the pages.jsonl file, see https://github.com/mozilla/readability" ,
290+ type : "boolean" ,
291+ default : false ,
292+ } ,
293+
284294 "cwd" : {
285295 describe : "Crawl working directory for captures (pywb root). If not set, defaults to process.cwd()" ,
286296 type : "string" ,
@@ -571,14 +581,33 @@ class Crawler {
571581
572582
573583 const title = await page . title ( ) ;
574- let text = "" ;
584+ let text = null ;
585+ let article = null ;
586+
575587 if ( this . params . text ) {
576588 const client = await page . target ( ) . createCDPSession ( ) ;
577589 const result = await client . send ( "DOM.getDocument" , { "depth" : - 1 , "pierce" : true } ) ;
578590 text = await new TextExtract ( result ) . parseTextFromDom ( ) ;
579591 }
580-
581- await this . writePage ( data . url , title , this . params . text , text ) ;
592+
593+ if ( this . params . readerView ) {
594+ article = { } ;
595+ try {
596+ // Note: DOM tree is cloned to avoid side effects
597+ // because it is modified by @mozilla/readability
598+ await page . exposeFunction ( "readabilityLog" , ( msg ) => console . log ( msg ) ) ;
599+ article = await page . evaluate ( `${ readabilityJs } ;\n(async () => {
600+ if (isProbablyReaderable(document)) {
601+ return await new Readability(document.cloneNode(true)).parse();
602+ } else {
603+ readabilityLog("Not readerable: " + document.URL);
604+ }})();` ) ;
605+ } catch ( e ) {
606+ console . log ( "Error applying reader view:" , e ) ;
607+ }
608+ }
609+
610+ await this . writePage ( data . url , title , text , article ) ;
582611
583612 if ( this . behaviorOpts ) {
584613 await Promise . allSettled ( page . frames ( ) . map ( frame => frame . evaluate ( "self.__bx_behaviors.run();" ) ) ) ;
@@ -792,14 +821,20 @@ class Crawler {
792821
793822 if ( createNew ) {
794823 const header = { "format" : "json-pages-1.0" , "id" : "pages" , "title" : "All Pages" } ;
824+ header [ "hasText" ] = this . params . text ;
825+ header [ "hasReaderView" ] = this . params . readerView ;
826+ let msg = "creating pages " ;
795827 if ( this . params . text ) {
796- console . log ( "creating pages with full text" ) ;
797- header [ "hasText" ] = true ;
798- }
799- else {
800- console . log ( "creating pages without full text" ) ;
801- header [ "hasText" ] = false ;
828+ msg += "with full text" ;
829+ if ( this . params . readerView ) {
830+ msg += " and reader view" ;
831+ }
832+ } else if ( this . params . readerView ) {
833+ msg += "with reader view" ;
834+ } else {
835+ msg += "without full text or reader view" ;
802836 }
837+ console . log ( msg ) ;
803838 const header_formatted = JSON . stringify ( header ) . concat ( "\n" ) ;
804839 await this . pagesFH . writeFile ( header_formatted ) ;
805840 }
@@ -809,14 +844,18 @@ class Crawler {
809844 }
810845 }
811846
812- async writePage ( url , title , text , text_content ) {
847+ async writePage ( url , title , text , article ) {
813848 const id = uuidv4 ( ) ;
814849 const row = { "id" : id , "url" : url , "title" : title } ;
815850
816- if ( text == true ) {
817- row [ "text" ] = text_content ;
851+ if ( text ) {
852+ row [ "text" ] = text ;
818853 }
819-
854+
855+ if ( article ) {
856+ row [ "article" ] = article ;
857+ }
858+
820859 const processedRow = JSON . stringify ( row ) . concat ( "\n" ) ;
821860 try {
822861 this . pagesFH . writeFile ( processedRow ) ;
0 commit comments