Refactor DeclutterService to a separate package.

Add WavenetService package. Remove old dependencies. Rename application due to new domain. Add reading level and grade level calculation.
davedavis · Feb 19, 2022 · 72094cf · 72094cf
1 parent 0233099
commit 72094cf
Show file tree

Hide file tree

Showing 9 changed files with 179 additions and 189 deletions.
diff --git a/Readme.md b/Readme.md
@@ -1,6 +1,6 @@
-# PDFit
+# Klutter
 
-PDFit.io is a webservice that accepts a URL from a user and returns
+Klutter.io is a webservice that accepts a URL from a user and returns
 a de-cluttered, ad-free and readability.js inspired simple format or
 PDF for use with e-ink and e-reader devices like the remarkable or
 kindle readers. 
@@ -25,7 +25,7 @@ https://davedavis.atlassian.net/jira/software/c/projects/PDFIT/boards/4/backlog
 
 
 ## Sprint 0 strategy
-- Build out the declutter service using the Jsoup and crux libraries.
+- Build out the declutter service using the Jsoup and r4j libraries.
 - Receive a URL in either the body or a URL parameter
 - Set up API V1
 - Add decluttered resource to the DB

diff --git a/pom.xml b/pom.xml
@@ -6,13 +6,15 @@
         <groupId>org.springframework.boot</groupId>
         <artifactId>spring-boot-starter-parent</artifactId>
         <version>2.6.3</version>
-        <relativePath/> <!-- lookup parent from repository -->
+        <relativePath/>
     </parent>
-    <groupId>io.pdfit</groupId>
-    <artifactId>readlater</artifactId>
+    <groupId>io.klutter</groupId>
+    <artifactId>Klutter</artifactId>
     <version>0.0.1-SNAPSHOT</version>
-    <name>pdfit</name>
-    <description>pdfit</description>
+    <name>Klutter</name>
+    <description>Klutter is a tool that removes clutter like ads, menus,
+        navigation, sidebars, headers and footers from articles and documents
+        and provides a more usable e-ink version in PDF format.</description>
     <properties>
         <java.version>11</java.version>
     </properties>
@@ -58,12 +60,7 @@
             <version>1.14.3</version>
         </dependency>
 
-        <!-- Java implementation of readability.js from Mozilla-->
-        <dependency>
-            <groupId>com.chimbori.crux</groupId>
-            <artifactId>crux</artifactId>
-            <version>3.0.1</version>
-        </dependency>
+
 
 
         <dependency>
@@ -80,7 +77,6 @@
             <artifactId>selenium-java</artifactId>
             <version>4.0.0</version>
         </dependency>
-        <!-- add belows for these dependencies version is not 4.0.0 when automatically generated -->
         <dependency>
             <groupId>org.seleniumhq.selenium</groupId>
             <artifactId>selenium-chrome-driver</artifactId>
@@ -97,17 +93,32 @@
             <version>4.0.0</version>
         </dependency>
 
-
+        <!--Mozilla Readability wrapper-->
         <dependency>
             <groupId>net.dankito.readability4j</groupId>
             <artifactId>readability4j</artifactId>
             <version>1.0.8</version>
         </dependency>
 
+        <!-- Flesch-Kincaid library for calculating reading ease and grade level. -->
+        <dependency>
+            <groupId>io.whelk.flesch.kincaid</groupId>
+            <artifactId>whelk-flesch-kincaid</artifactId>
+            <version>0.1.8</version>
+        </dependency>
 
 
 
 
+        <!-- WebDriverManager, so that a driver executable does not have to be packaged with the app. -->
+        <dependency>
+            <groupId>io.github.bonigarcia</groupId>
+            <artifactId>webdrivermanager</artifactId>
+            <version>5.1.0</version>
+        </dependency>
+
+
+
 
 
 

diff --git a/src/main/java/io/pdfit/PdfitApplication.java → ...n/java/io/klutter/KlutterApplication.java b/src/main/java/io/pdfit/PdfitApplication.java → ...n/java/io/klutter/KlutterApplication.java
@@ -1,15 +1,15 @@
-package io.pdfit;
+package io.klutter;
 
 import org.springframework.boot.SpringApplication;
 import org.springframework.boot.autoconfigure.SpringBootApplication;
 
 //Add the scanBasePackages parameter to the annotation as I added my services in
 //separate packages so, they need to be configured on application start.
-@SpringBootApplication(scanBasePackages = {"io.pdfit.declutterservice", "io.pdfit.pdfservice"} )
-public class PdfitApplication {
+@SpringBootApplication(scanBasePackages = {"io.klutter.declutterservice", "io.klutter.pdfservice"} )
+public class KlutterApplication {
 
     public static void main(String[] args) {
-        SpringApplication.run(PdfitApplication.class, args);
+        SpringApplication.run(KlutterApplication.class, args);
         System.out.println("Application Running: http://localhost:8080");
     }
 

diff --git a/src/main/java/io/klutter/declutterservice/DeclutterService.java b/src/main/java/io/klutter/declutterservice/DeclutterService.java
@@ -0,0 +1,79 @@
+package io.klutter.declutterservice;
+
+import io.github.bonigarcia.wdm.WebDriverManager;
+import io.whelk.flesch.kincaid.ReadabilityCalculator;
+import net.dankito.readability4j.Readability4J;
+import okhttp3.HttpUrl;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.safety.Safelist;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.chrome.ChromeOptions;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RestController;
+
+import java.io.IOException;
+
+@RestController
+@RequestMapping("/api/v1")
+public class DeclutterService {
+
+    @RequestMapping("/declutter")
+    public String index() throws IOException {
+
+        String url = "https://realpython.com/python-sockets/";
+
+        // Selenium. Using Selenium because jsoup doesn't handle JS and lazy loading.
+        // System.setProperty("webdriver.chrome.driver", "/home/dave/chromedriver");
+        ChromeOptions options = new ChromeOptions();options.addArguments("--headless");
+        //WebDriver driver = new ChromeDriver(options);
+
+        // Use WebDriverManager to set up the Chrome driver.
+        WebDriverManager.chromedriver().setup();
+        WebDriver driver = new ChromeDriver(options);
+
+        // ToDo: Receive URL from frontend.
+        driver.get(url);
+
+        // Get the raw HTML source.
+        String html = driver.getPageSource();
+
+        // Parse with Jsoup, so we can work with it.
+        Document doc = Jsoup.parse(html);
+
+        // ToDo: Do a bit of sanitization on the HTML before passing to the PDF service.
+        String safe = Jsoup.clean(doc.html(), Safelist.basic());
+
+        // Process with the readability4j mozilla readability.js wrapper.
+        Readability4J readability4J = new Readability4J(url, doc);
+        net.dankito.readability4j.Article article = readability4J.parse();
+
+        // returns extracted content in a <div> element
+        String extractedContentHtml = article.getContent();
+        // to get content wrapped in <html> tags and encoding set to UTF-8, see chapter 'Output encoding'
+        String extractedContentHtmlWithUtf8Encoding = article.getContentWithUtf8Encoding();
+        String extractedContentPlainText = article.getTextContent();
+        String title = article.getTitle();
+        String byline = article.getByline();
+        String excerpt = article.getExcerpt();
+
+
+        // Get the reading ease score.
+        double ease = ReadabilityCalculator.calculateReadingEase(extractedContentPlainText);
+
+        // Get the grade level score.
+        double grade = ReadabilityCalculator.calculateGradeLevel(extractedContentPlainText);
+
+        // Check it's working
+        System.out.println(ease + " " + grade);
+
+        // ToDo: Add user, title, excerpt, byline, content, ease, grade and tag array to model.
+
+        // Return the clean HTML
+        return extractedContentHtml;
+    }
+
+
+}
+
diff --git a/src/main/java/io/klutter/pdfservice/PdfService.java b/src/main/java/io/klutter/pdfservice/PdfService.java
@@ -0,0 +1,63 @@
+package io.klutter.pdfservice;
+import com.github.jhonnymertz.wkhtmltopdf.wrapper.Pdf;
+import com.github.jhonnymertz.wkhtmltopdf.wrapper.configurations.WrapperConfig;
+import com.github.jhonnymertz.wkhtmltopdf.wrapper.params.Param;
+import io.github.bonigarcia.wdm.WebDriverManager;
+import net.dankito.readability4j.Readability4J;
+import okhttp3.HttpUrl;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.safety.Safelist;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
+
+import org.openqa.selenium.chrome.ChromeOptions;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RestController;
+
+import java.io.*;
+
+@RestController
+@RequestMapping("/api/v1")
+public class PdfService {
+
+
+
+    @RequestMapping("/pdf")
+
+    public String index(){
+
+        String url = "https://realpython.com/python-sockets/";
+        Document doc = null;
+        try {
+            doc = Jsoup.connect("https://realpython.com/python-sockets/").get();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+
+        Pdf pdf = new Pdf();
+
+        // pdf.addPageFromString(doc.toString());
+        pdf.addPageFromUrl("https://realpython.com/python-sockets/");
+
+        // Add a Table of Contents
+        pdf.addToc();
+        pdf.addParam(new Param("--disable-javascript"));
+
+        // Add styling for Table of Contents
+        //        pdf.addTocParam(new Param("--xsl-style-sheet", "my_toc.xsl"));
+
+        // Save the PDF
+        try {
+            pdf.saveAs("output.pdf");
+        } catch (IOException e) {
+            e.printStackTrace();
+        } catch (InterruptedException e) {
+            e.printStackTrace();
+        }
+
+        return "Hello from another mapping";
+    }
+}
+
diff --git a/src/main/java/io/klutter/wavenetservice/WavenetService.java b/src/main/java/io/klutter/wavenetservice/WavenetService.java
@@ -0,0 +1,4 @@
+package io.klutter.wavenetservice;
+
+public class WavenetService {
+}
diff --git a/src/main/java/io/pdfit/declutterservice/DeclutterService.java b/src/main/java/io/pdfit/declutterservice/DeclutterService.java