Several fixes #3

Open · wants to merge 11 commits into master
2 changes: 1 addition & 1 deletion web-scraping/author.py
@@ -19,7 +19,7 @@ def get_author_id(self):
 
     def _set_author_id(self):
         author_id = self.db_uploader.select_articles_authors(self.name)
-        print author_id
+        #print author_id
         if author_id is None:
             return self.db_uploader.insert_articles_authors(self.name)
 
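Note: `print author_id` is Python 2 statement syntax and is a SyntaxError under Python 3, which is why these bare print statements are commented out across this PR. If the debug output is still wanted rather than dropped, the function form of print runs on both interpreters; a minimal sketch (assuming the output should be kept at all):

    # At the top of the module (harmless under Python 3, enables the function form on Python 2.7):
    from __future__ import print_function

    # Then inside _set_author_id, instead of commenting the statement out:
    print(author_id)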
9 changes: 8 additions & 1 deletion web-scraping/db_uploader.py
@@ -19,11 +19,18 @@ def insert_articles_articles(self, content):
 
         return articles_id
 
+    def select_articles_articles_by_url(self, url):
+        with self.conn.cursor() as cur:
+            sql = "SELECT id from articles_articles where url = '{0}'".format(url)
+            cur.execute(sql)
+            res = cur.fetchone()
+            return res
+
     def select_articles_authors(self, name):
         with self.conn.cursor() as cur:
             name_hash = self.name_hash(name)
             sql = "SELECT id FROM articles_authors where name_hash = '{0}'".format(name_hash)
-            print sql
+            #print sql
             cur.execute(sql)
             return cur.fetchone()
 
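Note: the new `select_articles_articles_by_url` interpolates the URL into the SQL string with `str.format`, so a URL containing a quote character would break the statement. A parameterized variant is sketched below; it assumes a DB-API driver using the `%s` paramstyle (e.g. pymysql or psycopg2), which the `with self.conn.cursor() as cur:` usage suggests but the diff does not confirm:

    def select_articles_articles_by_url(self, url):
        # Sketch only: same lookup, but the driver binds and escapes the value.
        with self.conn.cursor() as cur:
            sql = "SELECT id FROM articles_articles WHERE url = %s"
            cur.execute(sql, (url,))
            return cur.fetchone()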
12 changes: 9 additions & 3 deletions web-scraping/reuters_html2content.py
@@ -15,7 +15,13 @@ def __init__(self, conn):
         self.conn = conn
 
     def parse(self, full_url, soup):
-        author = Author(self.get_author_name(soup), self.conn)
+        author_name = self.get_author_name(soup)
+        if author_name is None:
+            return None
+        author = Author(author_name, self.conn)
+        author_id = author.get_author_id()
+        if author_id is None:
+            return None
         return Content(
             author.get_author_id(),
             self.get_article_text(soup),
@@ -35,7 +41,7 @@ def get_article_text(self, soup):
         try:
             return "\n".join(map(lambda x: x.text, article_text.find_all("p")))
         except AttributeError:
-            print article_text
+            #print article_text
             return article_text
 
     def get_revision_date(self, soup):
@@ -59,5 +65,5 @@ def parse_time(self, tstr):
     html2content = ReutersHtml2Content()
     url = "http://jp.reuters.com/article/idJP2017020301002019?sp=true"
     soup = scraper.get_sorp(url);
-    print html2content.parse(url, soup)
+    #print html2content.parse(url, soup)

7 changes: 5 additions & 2 deletions web-scraping/reuters_jp_columns_app.py
@@ -19,5 +19,8 @@
         full_url = ReutersJpColummsScraper.get_full_url(url)
         soup = scraper.get_soup(full_url)
         content = html2content.parse(full_url, soup)
-        writer.write_articles_file(content)
-        page += 1
+        if content:
+            writer.write_articles_file(content)
+        else:
+            print("content registration error")
+    scraper.load_more_content()
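For context, a sketch of how the guarded write might sit in the surrounding pagination loop. Only the loop body appears in this hunk, so the `for`/`while` framing and the `MAX_PAGES` bound below are assumptions; `scraper`, `writer`, and `html2content` are taken to be constructed earlier in the script, as in the other app files:

    for page in range(1, MAX_PAGES + 1):                  # assumed outer pagination loop
        for url in scraper.get_url_list():                # assumed per-page URL loop
            full_url = ReutersJpColummsScraper.get_full_url(url)
            soup = scraper.get_soup(full_url)
            content = html2content.parse(full_url, soup)
            if content:
                writer.write_articles_file(content)
            else:
                print("content registration error")       # parse() returned None
        scraper.load_more_content()                       # advance to the next batch of articles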
4 changes: 2 additions & 2 deletions web-scraping/reuters_jp_columns_scraper.py
@@ -16,7 +16,7 @@ def __init__(self, log_path):
 
     def get_target_url(self):
         url = self.BASE_URL.format(self.page)
-        print url
+        #print url
         return url
 
     def get_soup(self, url):
@@ -38,4 +38,4 @@ def get_full_url(cls, article_path):
 ## test
 if __name__ == '__main__':
     scraper = ReutersJpColummsScraper(ReutersJpColummsScraper.LOG_PATH)
-    print scraper.get_url_list()
+    #print scraper.get_url_list()
2 changes: 1 addition & 1 deletion web-scraping/reuters_news_app.py
@@ -22,4 +22,4 @@
         content = html2content.parse(full_url, soup)
         #writer.replace_author(content)
         writer.write_articles_file(content)
-    scraper.load_more_content()
+    scraper.load_more_content()
6 changes: 3 additions & 3 deletions web-scraping/reuters_the_wire_scraper.py
@@ -20,7 +20,7 @@ def get_sorp(self):
 
     def get_url_list(self):
         href_list = self.get_sorp().find_all("a", href=self.RE_ARTICLE)
-        url_list = map(lambda x: x.get("href"), href_list)
+        url_list = list(map(lambda x: x.get("href"), href_list))
         if(len(url_list) > 20):
             del url_list[0:19]
         return url_list
@@ -37,5 +37,5 @@ def get_full_url(cls, article_path):
     scraper = ReutersTheWireScraper(ReutersTheWireScraper.LOG_PATH)
     scraper.load_more_content()
     scraper.load_more_content()
-    print scraper.get_url_list()
-    print len(scraper.get_url_list())
+    #print scraper.get_url_list()
+    #print len(scraper.get_url_list())
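Note: the `list(map(...))` change matters because on Python 3 `map` returns a lazy iterator rather than a list, so the `len(url_list)` check and the `del url_list[0:19]` slice deletion in `get_url_list` would raise TypeError. A small standalone illustration of the behaviour (not code from this repository):

    hrefs = ["/article/a", "/article/b", "/article/c"]
    urls = map(lambda x: "http://jp.reuters.com" + x, hrefs)
    # len(urls)  ->  TypeError: object of type 'map' has no len()   (Python 3)
    urls = list(map(lambda x: "http://jp.reuters.com" + x, hrefs))
    print(len(urls))   # 3
    del urls[0:2]      # slice deletion works on a real list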
4 changes: 2 additions & 2 deletions web-scraping/scraping_lib.py
@@ -45,7 +45,7 @@ def get_latest_sorp(self):
         return self.create_soup(markup)
 
     def get_markup_by_driver(self, url):
-        print url
+        #print url
         self.driver.get(url)
         source = self.driver.page_source
         return source.encode("utf-8")
@@ -69,4 +69,4 @@ class ScrapingLibException(BaseException):
 ## test
 if __name__ == '__main__':
     scraper = ScrapingLib()
-    print scraper.get_sorp("http://sumodb.sumogames.de/Results_text.aspx?b=201509&d=9")
+    #print scraper.get_sorp("http://sumodb.sumogames.de/Results_text.aspx?b=201509&d=9")
2 changes: 1 addition & 1 deletion web-scraping/toyokeizai_app.py
@@ -24,4 +24,4 @@
         content = html2content.parse(full_url, soup)
         #writer.replace_author(content)
         writer.write_articles_file(content)
-    page += 1
+    scraper.load_more_content() #page += 1
3 changes: 3 additions & 0 deletions web-scraping/writer.py
@@ -21,6 +21,9 @@ def __init__(self, conn):
         self.uploader = DbUploader(conn)
 
     def write_articles_file(self, content):
+        content_id = self.uploader.select_articles_articles_by_url(content.url)
+        if content_id is not None:
+            return False
         articles_id = self.uploader.insert_articles_articles(content)
         if articles_id is None:
             return False
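Note: with this guard, `write_articles_file` returns False both when the article's URL is already stored and when the insert fails, so the two cases cannot be told apart from the return value alone. A minimal caller sketch; the `Writer` class name and the `conn` setup are assumptions (only `writer.write_articles_file(content)` appears elsewhere in this diff):

    writer = Writer(conn)                       # class name assumed from writer.py
    if not writer.write_articles_file(content):
        print("skipped: duplicate URL or failed insert")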