Skip to content

Commit

Permalink
Do not get stuck when downloads fail
Browse files Browse the repository at this point in the history
  • Loading branch information
yindaheng98 committed Dec 15, 2023
1 parent 4746145 commit 79e376e
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 5 deletions.
9 changes: 8 additions & 1 deletion dblp_crawler/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,14 @@ def filter_publications_at_crawler(publications, year: int, keywords: Keywords):


async def bfs_to_end(graph, limit: int = 0):
    """Run breadth-first crawl rounds on *graph* until there is nothing left to do.

    Each round awaits ``graph.bfs_once()``, which must return a 3-tuple
    ``(remain_none, total_author_count, all_fail)``.

    The loop stops when any of the following holds:

    * ``all_fail`` is truthy (every download in the round failed), so the
      crawl does not spin forever on a dead network;
    * ``remain_none + total_author_count <= 0`` (no work left for a next round);
    * ``limit`` rounds have been executed.

    Args:
        graph: Object exposing an awaitable ``bfs_once()`` method.
        limit: Maximum number of rounds. ``0`` runs no rounds at all; a
            negative value never reaches zero when decremented, so it
            places no cap on the number of rounds.
    """
    while limit != 0:
        remain_none, total_author_count, all_fail = await graph.bfs_once()
        if all_fail:
            logger.info("Downloading all failed, exit")
            return
        if (remain_none + total_author_count) <= 0:
            logger.info("No more need downloaded, exit")
            return
        logger.info("Still running......")
        limit -= 1

Expand Down
10 changes: 7 additions & 3 deletions dblp_crawler/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ def filter_publications_at_output(self, publications: Iterable[Publication]) ->
async def download_person(self, pid: str) -> bool:
    """Download one person's record and store it in ``self.persons``.

    Awaits the module-level ``download_person(pid)`` helper. When it
    yields ``None`` nothing is stored and the failure is reported to the
    caller instead of being silently swallowed.

    Args:
        pid: Identifier of the person to download.

    Returns:
        ``True`` if data was received and ``self.persons[pid]`` was set to a
        ``DBLPPerson`` built from it; ``False`` if the download returned
        ``None``.
    """
    data = await download_person(pid)
    if data is None:
        return False
    self.persons[pid] = DBLPPerson(data)
    return True

async def bfs_once(self) -> tuple[int, int]:
if not self.journals_inited:
Expand Down Expand Up @@ -101,15 +102,18 @@ async def bfs_once(self) -> tuple[int, int]:
total_author_count += author_count
total_publication_count += publication_count
logger.info("%d authors from %d publications is fetching" % (total_author_count, total_publication_count))
await asyncio.gather(*tasks)
success = await asyncio.gather(*tasks)
all_fail = len(success) <= 0 or True not in success
logger.info("%d authors added from %d publications" % (total_author_count, total_publication_count))
if all_fail:
logger.warn("%d download_person all failed in this loop" % all_fail)
remain_none = 0
for person in self.persons.values():
if person is None:
remain_none += 1
logger.info("There are %d authors need init in next loop" % remain_none)
logger.info("There are %d authors need publications fetching in next loop" % total_author_count)
return remain_none, total_author_count
return remain_none, total_author_count, all_fail

@abc.abstractmethod
def summarize_person(self, a: str, person: Optional[DBLPPerson]) -> None: # 构建summary
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

setup(
name='dblp_crawler',
version='1.8.8',
version='1.8.9',
author='yindaheng98',
author_email='[email protected]',
url='https://github.com/yindaheng98/dblp-crawler',
Expand Down

0 comments on commit 79e376e

Please sign in to comment.