Skip to content

Commit

Permalink
Refactor DeclutterService to a separate package.
Browse files Browse the repository at this point in the history
Add WavenetService package
Remove old dependencies
Rename application due to new domain
Add reading level and grade level calculation
  • Loading branch information
davedavis committed Feb 19, 2022
1 parent 0233099 commit 72094cf
Show file tree
Hide file tree
Showing 9 changed files with 179 additions and 189 deletions.
6 changes: 3 additions & 3 deletions Readme.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# PDFit
# Klutter

PDFit.io is a webservice that accepts a URL from a user and returns
Klutter.io is a webservice that accepts a URL from a user and returns
a de-cluttered, ad-free, readability.js-inspired simple format or
PDF for use with e-ink and e-reader devices like the reMarkable or
Kindle readers.
Expand All @@ -25,7 +25,7 @@ https://davedavis.atlassian.net/jira/software/c/projects/PDFIT/boards/4/backlog


## Sprint 0 strategy
- Build out the declutter service using the Jsoup and crux libraries.
- Build out the declutter service using the Jsoup and r4j libraries.
- Receive a URL in either the body or a URL parameter
- Set up API V1
- Add decluttered resource to the DB
Expand Down
37 changes: 24 additions & 13 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.6.3</version>
<relativePath/> <!-- lookup parent from repository -->
<relativePath/>
</parent>
<groupId>io.pdfit</groupId>
<artifactId>readlater</artifactId>
<groupId>io.klutter</groupId>
<artifactId>Klutter</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>pdfit</name>
<description>pdfit</description>
<name>Klutter</name>
<description>Klutter is a tool that removes clutter like ads, menus,
navigation, sidebars, headers and footers from articles and documents
and provides a more usable e-ink version in PDF format.</description>
<properties>
<java.version>11</java.version>
</properties>
Expand Down Expand Up @@ -58,12 +60,7 @@
<version>1.14.3</version>
</dependency>

<!-- Java implementation of readability.js from Mozilla-->
<dependency>
<groupId>com.chimbori.crux</groupId>
<artifactId>crux</artifactId>
<version>3.0.1</version>
</dependency>



<dependency>
Expand All @@ -80,7 +77,6 @@
<artifactId>selenium-java</artifactId>
<version>4.0.0</version>
</dependency>
<!-- add belows for these dependencies version is not 4.0.0 when automatically generated -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-chrome-driver</artifactId>
Expand All @@ -97,17 +93,32 @@
<version>4.0.0</version>
</dependency>


<!--Mozilla Readability wrapper-->
<dependency>
<groupId>net.dankito.readability4j</groupId>
<artifactId>readability4j</artifactId>
<version>1.0.8</version>
</dependency>

<!-- Flesch library for calculating reading difficulty.-->
<dependency>
<groupId>io.whelk.flesch.kincaid</groupId>
<artifactId>whelk-flesch-kincaid</artifactId>
<version>0.1.8</version>
</dependency>




<!-- Web driver manager so that I don't have to package an executable with the app.-->
<dependency>
<groupId>io.github.bonigarcia</groupId>
<artifactId>webdrivermanager</artifactId>
<version>5.1.0</version>
</dependency>






Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
package io.pdfit;
package io.klutter;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

// Add the scanBasePackages parameter to the annotation because the services live in
// separate packages, so they need to be configured on application start.
@SpringBootApplication(scanBasePackages = {"io.pdfit.declutterservice", "io.pdfit.pdfservice"} )
public class PdfitApplication {
@SpringBootApplication(scanBasePackages = {"io.klutter.declutterservice", "io.klutter.pdfservice"} )
public class KlutterApplication {

public static void main(String[] args) {
SpringApplication.run(PdfitApplication.class, args);
SpringApplication.run(KlutterApplication.class, args);
System.out.println("Application Running: http://localhost:8080");
}

Expand Down
79 changes: 79 additions & 0 deletions src/main/java/io/klutter/declutterservice/DeclutterService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package io.klutter.declutterservice;

import io.github.bonigarcia.wdm.WebDriverManager;
import io.whelk.flesch.kincaid.ReadabilityCalculator;
import net.dankito.readability4j.Readability4J;
import okhttp3.HttpUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Safelist;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;

/**
 * REST controller exposing {@code /api/v1/declutter}.
 *
 * <p>Loads a page in headless Chrome, runs the rendered HTML through the
 * readability4j wrapper and returns the extracted article content. The reading
 * ease and grade level of the extracted plain text are computed and printed.
 */
@RestController
@RequestMapping("/api/v1")
public class DeclutterService {

    /**
     * Declutters a (currently hard-coded) article URL.
     *
     * @return the extracted article content as an HTML {@code <div>} fragment
     * @throws IOException declared for interface compatibility with existing callers
     */
    @RequestMapping("/declutter")
    public String index() throws IOException {

        // ToDo: Receive URL from frontend.
        String url = "https://realpython.com/python-sockets/";

        // Get the raw HTML source as rendered by the browser.
        String html = fetchRenderedHtml(url);

        // Parse with Jsoup, so we can work with it.
        Document doc = Jsoup.parse(html);

        // ToDo: Do a bit of sanitization on the HTML (e.g. Jsoup.clean with
        //       Safelist.basic()) before passing to the PDF service.

        // Process with the readability4j mozilla readability.js wrapper.
        Readability4J readability4J = new Readability4J(url, doc);
        net.dankito.readability4j.Article article = readability4J.parse();

        // Extracted content in a <div> element, and its plain-text form.
        String extractedContentHtml = article.getContent();
        String extractedContentPlainText = article.getTextContent();

        // Get the reading ease score.
        double ease = ReadabilityCalculator.calculateReadingEase(extractedContentPlainText);

        // Get the grade level score.
        double grade = ReadabilityCalculator.calculateGradeLevel(extractedContentPlainText);

        // Check it's working
        System.out.println(ease + " " + grade);

        // ToDo: Add user, title, excerpt, byline, content, ease, grade and tag array to model.

        // Return the clean HTML
        return extractedContentHtml;
    }

    /**
     * Loads {@code url} in headless Chrome and returns the page source.
     *
     * <p>Selenium is used because jsoup doesn't handle JS and lazy loading.
     * The driver is always quit, even when navigation fails, so that browser
     * sessions do not pile up across requests.
     */
    private String fetchRenderedHtml(String url) {
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--headless");

        // WebDriverManager resolves the chromedriver binary, so none is packaged with the app.
        WebDriverManager.chromedriver().setup();
        WebDriver driver = new ChromeDriver(options);
        try {
            driver.get(url);
            return driver.getPageSource();
        } finally {
            driver.quit();
        }
    }
}

63 changes: 63 additions & 0 deletions src/main/java/io/klutter/pdfservice/PdfService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package io.klutter.pdfservice;
import com.github.jhonnymertz.wkhtmltopdf.wrapper.Pdf;
import com.github.jhonnymertz.wkhtmltopdf.wrapper.configurations.WrapperConfig;
import com.github.jhonnymertz.wkhtmltopdf.wrapper.params.Param;
import io.github.bonigarcia.wdm.WebDriverManager;
import net.dankito.readability4j.Readability4J;
import okhttp3.HttpUrl;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Safelist;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.*;

/**
 * REST controller exposing {@code /api/v1/pdf}.
 *
 * <p>Builds a PDF from a (currently hard-coded) URL with the wkhtmltopdf
 * wrapper and saves it as {@code output.pdf}.
 */
@RestController
@RequestMapping("/api/v1")
public class PdfService {

    /**
     * Renders the article URL to {@code output.pdf}.
     *
     * <p>Failures while saving are reported on stderr and do not change the
     * response.
     *
     * @return a fixed placeholder response string
     */
    @RequestMapping("/pdf")
    public String index() {

        String url = "https://realpython.com/python-sockets/";

        Pdf pdf = new Pdf();
        pdf.addPageFromUrl(url);

        // Add a Table of Contents
        pdf.addToc();
        pdf.addParam(new Param("--disable-javascript"));

        // Save the PDF
        try {
            pdf.saveAs("output.pdf");
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the calling thread can still observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }

        return "Hello from another mapping";
    }
}

4 changes: 4 additions & 0 deletions src/main/java/io/klutter/wavenetservice/WavenetService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
package io.klutter.wavenetservice;

/**
 * Empty placeholder type for the {@code wavenetservice} package; it declares
 * no fields or methods yet.
 */
public class WavenetService {}
34 changes: 0 additions & 34 deletions src/main/java/io/pdfit/declutterservice/DeclutterService.java

This file was deleted.

Loading

0 comments on commit 72094cf

Please sign in to comment.