Skip to content

Commit

Permalink
refactor: get_reviews generator
Browse files Browse the repository at this point in the history
  • Loading branch information
ernix committed Aug 18, 2019
1 parent c09be8f commit 2053bb0
Showing 1 changed file with 28 additions and 24 deletions.
52 changes: 28 additions & 24 deletions eigacom_review/eigacom_review_scraping.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import csv
import itertools
import logging
import requests
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -46,8 +47,32 @@ def search(query):
return None


def get_reviews(url, *, timeout=10, interval=1):
    """Yield the text of every user review reachable from *url*.

    Pages are fetched one after another, starting at page 1, until a page
    contains no ``div.user-review`` elements.  Each yielded string is the
    review title, a newline, and the review body with its own newlines
    removed.

    Args:
        url: Base URL of the review listing.  It is combined with the page
            number by ``concat_url_path`` (defined elsewhere in this file;
            presumably it builds the URL of the given page -- confirm).
        timeout: Seconds to wait for each HTTP request.  Without a timeout
            ``requests.get`` can block forever on a stalled connection.
        interval: Seconds to sleep after each page that contained reviews,
            before requesting the next one.

    Yields:
        str: ``"<title>\\n<body>"`` for each review block that has a body.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (including when *timeout* expires).
    """
    for page_num in itertools.count(1):
        res = requests.get(concat_url_path(url, page_num), timeout=timeout)

        # The raw bytes are parsed, so BeautifulSoup works out the encoding
        # itself; setting ``res.encoding`` would only affect ``res.text``.
        soup = BeautifulSoup(res.content, "lxml")
        reviews = soup.select('div.user-review')

        # An empty page means we have gone past the last page of reviews.
        if not reviews:
            break

        for review in reviews:
            title = review.select_one('h2.review-title a')
            main_text = review.select_one('div.txt-block')

            # select_one returns None when nothing matches; skip a review
            # block without a body instead of crashing the whole scrape.
            if main_text is None:
                logging.warning(
                    "review without text block skipped (page %d)", page_num
                )
                continue

            # Drop the toggle button so its label does not end up in the
            # extracted review text.
            tgl_btn = main_text.select_one('div.toggle-btn')
            if tgl_btn is not None:
                tgl_btn.decompose()

            title_text = title.text if title is not None else ""
            yield title_text + "\n" + main_text.text.replace("\n", "")

        time.sleep(interval)


def scrape_review(query):
page_num = 1
data = {
"id": -1,
"reviews": {
Expand All @@ -66,30 +91,9 @@ def scrape_review(query):
logging.warning("**************************************************")
return None

while(1):
res = requests.get(concat_url_path(url_review, page_num))
res.encoding = res.apparent_encoding
soup = BeautifulSoup(res.content, "lxml")
reviews = soup.select('div.user-review')

# ページ数の上限を超えたら
if not reviews:
print('DONE : ' + query)
break
data["reviews"]["eigacom"].extend(get_reviews(url_review))
print('DONE : ' + query)

for review in reviews:
title = review.select_one('h2.review-title a')
main_text = review.select_one('div.txt-block')
tgl_btn = main_text.select_one('div.toggle-btn')

if tgl_btn:
tgl_btn.decompose()

desc = title.text + "\n" + main_text.text.replace("\n", "")
data["reviews"]["eigacom"].append(desc)

page_num += 1
time.sleep(1)
return data


Expand Down

0 comments on commit 2053bb0

Please sign in to comment.