fix: [crawler] title extraction, sigalarm raised by signal.alarm and sleep
Terrtia committed Jan 9, 2025
1 parent 9425e01 commit 109ce56
Showing 4 changed files with 41 additions and 29 deletions.
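The failure mode, as reconstructed from the diffs below: extract_title_from_html previously armed a 60-second signal.alarm watchdog internally (see the commented-out code removed from bin/lib/crawlers.py), and an alarm left armed could fire later while the process idled in time.sleep, raising TimeoutException far from the parsing code it was meant to bound. A minimal standalone sketch of that interaction (illustrative code, not from the repository):

import signal
import time

class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

def parse_step():
    return '<title>ok</title>'  # finishes instantly

signal.alarm(2)      # watchdog armed around the parsing step...
parse_step()         # ...which succeeds, but the alarm is never disarmed
try:
    time.sleep(30)   # the stale SIGALRM fires ~2s in, during the idle sleep
except TimeoutException:
    print('stale alarm absorbed')  # the guard this commit adds around sleeps
signal.alarm(0)      # and the fix proper: always disarm after the guarded step

The commit therefore centralizes TimeoutException in lib/exceptions.py, arms and disarms the alarm at the single call site in Crawler.py, and guards every idle sleep against a stray alarm.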
bin/crawlers/Crawler.py (28 additions, 3 deletions)
@@ -17,6 +17,7 @@
 from lib import ail_logger
 from lib import crawlers
 from lib.ConfigLoader import ConfigLoader
+from lib.exceptions import TimeoutException
 from lib.Tag import get_domain_vanity_tags
 from lib.objects import CookiesNames
 from lib.objects import Etags
@@ -30,6 +31,15 @@
 
 logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
 
+# SIGNAL ALARM
+import signal
+def timeout_handler(signum, frame):
+    raise TimeoutException
+
+
+signal.signal(signal.SIGALRM, timeout_handler)
+
+
 class Crawler(AbstractModule):
 
     def __init__(self):
@@ -104,7 +114,10 @@ def refresh_lacus_status(self):
            self.is_lacus_up = False
        if not self.is_lacus_up:
            print("Can't reach lacus server", int(time.time()))
-           time.sleep(30)
+           try:
+               time.sleep(30)
+           except TimeoutException:
+               pass
 
    def print_crawler_start_info(self, url, domain_url):
        print()
@@ -183,7 +196,10 @@ def get_message(self):
                capture.update(-1)
                self.refresh_lacus_status()
 
-       time.sleep(self.pending_seconds)
+       try:
+           time.sleep(self.pending_seconds)
+       except TimeoutException:
+           pass
 
    def enqueue_capture(self, task_uuid, priority):
        task = crawlers.CrawlerTask(task_uuid)
@@ -364,7 +380,16 @@ def save_capture_response(self, parent_id, entries):
            dom_hash.add(self.date.replace('/', ''), item)
            dom_hash.add_correlation('domain', '', self.domain.id)
 
-       title_content = crawlers.extract_title_from_html(entries['html'], item_id)
+       # TITLE
+       signal.alarm(60)
+       try:
+           title_content = crawlers.extract_title_from_html(entries['html'])
+       except TimeoutException:
+           self.logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
+           title_content = None
+       else:
+           signal.alarm(0)
+
        if title_content:
            title = Titles.create_title(title_content)
            title.add(item.get_date(), item)
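Note the else branch above: signal.alarm(0) disarms the watchdog only when parsing returns in time; after a timeout the alarm has already fired, so there is nothing left to cancel. The same arm/disarm pattern can be packaged as a context manager, sketched here under the assumption that the SIGALRM handler installed at the top of this file is in place (this helper is not part of the commit):

import signal
from contextlib import contextmanager

@contextmanager
def sigalrm_timeout(seconds):
    # One-shot SIGALRM watchdog around the enclosed block; relies on a
    # handler that raises TimeoutException being installed already.
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # disarm on success and on unrelated errors alike

# Hypothetical usage at the call site above:
# try:
#     with sigalrm_timeout(60):
#         title_content = crawlers.extract_title_from_html(entries['html'])
# except TimeoutException:
#     title_content = None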
bin/lib/crawlers.py (3 additions, 23 deletions)
@@ -28,19 +28,6 @@
 
 from pyfaup.faup import Faup
 
-
-import signal
-
-class TimeoutException(Exception):
-    pass
-
-def timeout_handler(signum, frame):
-    raise TimeoutException
-
-
-signal.signal(signal.SIGALRM, timeout_handler)
-
-
 # interact with splash_crawler API
 import requests
 requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
@@ -73,7 +60,7 @@
 
 faup = Faup()
 
-logger_crawler = logging.getLogger('crawlers.log')
+# logger_crawler = logging.getLogger('crawlers.log')
 
 # # # # # # # #
 # #
@@ -325,21 +312,14 @@ def extract_favicon_from_html(html, url):
 # #
 # # # # # # # #
 
-def extract_title_from_html(html, item_id):
-    # signal.alarm(60)
-    # try:
+# /!\ REQUIRE ALARM SIGNAL
+def extract_title_from_html(html):
     soup = BeautifulSoup(html, 'html.parser')
     title = soup.title
     if title:
         title = title.string
         if title:
             return str(title)
-    # except TimeoutException:
-    #     signal.alarm(0)
-    #     logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
-    # else:
-    #     signal.alarm(0)
-    # signal.alarm(0)
     return ''
 
 def extract_description_from_html(html):
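With the timeout machinery stripped out, extract_title_from_html is a pure parsing helper, and the /!\ REQUIRE ALARM SIGNAL comment warns that callers must arm SIGALRM themselves, as Crawler.save_capture_response now does. Its edge-case behavior, checked with the function body copied from the diff above (assumes BeautifulSoup 4 is installed):

from bs4 import BeautifulSoup

def extract_title_from_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title
    if title:
        title = title.string
        if title:
            return str(title)
    return ''

assert extract_title_from_html('<html><head><title>AIL</title></head></html>') == 'AIL'
assert extract_title_from_html('<html><head></head><body></body></html>') == ''  # no <title>
assert extract_title_from_html('<title></title>') == ''  # empty <title>: .string is None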
bin/lib/exceptions.py (5 additions, 1 deletion)
@@ -1,7 +1,11 @@
 #!/usr/bin/env python3
 # -*-coding:UTF-8 -*
 
-from pymisp import PyMISPError
+# from pymisp import PyMISPError
+
+# SIGNAL ALARM
+class TimeoutException(Exception):
+    pass
 
 class AILError(Exception):
     def __init__(self, message):
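Relocating TimeoutException into lib/exceptions.py lets both bin/crawlers/Crawler.py and bin/modules/abstract_module.py catch it with a plain import; the class previously lived in bin/lib/crawlers.py, so importing it also ran that module's signal.signal(...) call as a side effect. The shared contract is now just:

from lib.exceptions import TimeoutException  # no handler is installed by this import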
bin/modules/abstract_module.py (5 additions, 2 deletions)
@@ -21,7 +21,7 @@
 from lib import ail_logger
 from lib.ail_queues import AILQueue
 from lib import regex_helper
-from lib.exceptions import ModuleQueueError
+from lib.exceptions import ModuleQueueError, TimeoutException
 from lib.objects.ail_objects import get_obj_from_global_id
 
 logging.config.dictConfig(ail_logger.get_config(name='modules'))
@@ -193,7 +193,10 @@ def run(self):
                self.computeNone()
                # Wait before next process
                self.logger.debug(f"{self.module_name}, waiting for new message, Idling {self.pending_seconds}s")
-               time.sleep(self.pending_seconds)
+               try:
+                   time.sleep(self.pending_seconds)
+               except TimeoutException:
+                   pass
 
    def _module_name(self):
        """
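This mirrors the guard added in Crawler.py: AbstractModule.run idles in time.sleep between messages, so an alarm armed by a subclass during compute() and never disarmed, for example because an unrelated exception escaped before the disarm ran, would otherwise kill the module loop when it finally fires. A standalone sketch of the guarded idle (illustrative, not repository code):

import time

class TimeoutException(Exception):  # stand-in for lib.exceptions.TimeoutException
    pass

def guarded_idle(pending_seconds):
    # Sleep between messages; a stale SIGALRM armed elsewhere may fire here,
    # so absorb the resulting TimeoutException instead of unwinding run().
    try:
        time.sleep(pending_seconds)
    except TimeoutException:
        pass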
