chg: [Filter unsafe onion] add a new unsafe onion filter option
Terrtia committed Feb 6, 2025
1 parent 98652a1 commit f01cfe7
Showing 5 changed files with 199 additions and 37 deletions.
97 changes: 66 additions & 31 deletions bin/crawlers/Crawler.py
@@ -17,7 +17,7 @@
from lib import ail_logger
from lib import crawlers
from lib.ConfigLoader import ConfigLoader
from lib.exceptions import TimeoutException
from lib.exceptions import TimeoutException, OnionFilteringError
from lib.Tag import get_domain_vanity_tags
from lib.objects import CookiesNames
from lib.objects import Etags
@@ -57,6 +57,9 @@ def __init__(self):

config_loader = ConfigLoader()

self.filter_unsafe_onion = crawlers.is_onion_filter_enabled(cache=False)
self.last_config_check = int(time.time())

self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
self.default_screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
self.default_depth_limit = config_loader.get_config_int('Crawler', 'default_depth_limit')
@@ -139,11 +142,31 @@ def get_message(self):
if not self.is_lacus_up:
return None

# Refresh Config
if int(time.time()) - self.last_config_check > 60:
self.filter_unsafe_onion = crawlers.is_onion_filter_enabled()
self.last_config_check = int(time.time())

# Check if a new Capture can be Launched
if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
task_row = crawlers.add_task_to_lacus_queue()
if task_row:
task, priority = task_row
domain = task.get_domain()
if self.filter_unsafe_onion:
if domain.endswith('.onion'):
try:
if not crawlers.check_if_onion_is_safe(domain):
# print('---------------------------------------------------------')
# print('DOMAIN FILTERED')
task.delete()
return None
except OnionFilteringError:
task.reset()
self.logger.warning(f'Onion Filtering Connection Error, {task.uuid} Send back in queue')
time.sleep(10)
return None

task.start()
task_uuid = task.uuid
try:
@@ -301,41 +324,46 @@ def compute(self, capture):
self.root_item = None

# Save Capture
self.save_capture_response(parent_id, entries)

if self.parent != 'lookup':
# Update domain first/last seen
self.domain.update_daterange(self.date.replace('/', ''))
# Origin + History + tags
if self.root_item:
self.domain.set_last_origin(parent_id)
# Vanity
self.domain.update_vanity_cluster()
domain_vanity = self.domain.get_vanity()
if domain_vanity in self.vanity_tags:
for tag in self.vanity_tags[domain_vanity]:
self.domain.add_tag(tag)
# Tags
for tag in task.get_tags():
self.domain.add_tag(tag)
# Crawler stats
self.domain.add_history(epoch, root_item=self.root_item)

if self.domain != self.original_domain:
self.original_domain.update_daterange(self.date.replace('/', ''))
saved = self.save_capture_response(parent_id, entries)
if saved:
if self.parent != 'lookup':
# Update domain first/last seen
self.domain.update_daterange(self.date.replace('/', ''))
# Origin + History + tags
if self.root_item:
self.original_domain.set_last_origin(parent_id)
self.domain.set_last_origin(parent_id)
# Vanity
self.domain.update_vanity_cluster()
domain_vanity = self.domain.get_vanity()
if domain_vanity in self.vanity_tags:
for tag in self.vanity_tags[domain_vanity]:
self.domain.add_tag(tag)
# Tags
for tag in task.get_tags():
self.domain.add_tag(tag)
self.original_domain.add_history(epoch, root_item=self.root_item)
# crawlers.update_last_crawled_domain(self.original_domain.get_domain_type(), self.original_domain.id, epoch)

crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
print('capture:', capture.uuid, 'completed')
print('task: ', task.uuid, 'completed')
print()
# Crawler stats
self.domain.add_history(epoch, root_item=self.root_item)

if self.domain != self.original_domain:
self.original_domain.update_daterange(self.date.replace('/', ''))
if self.root_item:
self.original_domain.set_last_origin(parent_id)
# Tags
for tag in task.get_tags():
self.domain.add_tag(tag)
self.original_domain.add_history(epoch, root_item=self.root_item)
# crawlers.update_last_crawled_domain(self.original_domain.get_domain_type(), self.original_domain.id, epoch)

crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
print('capture:', capture.uuid, 'completed')
print('task: ', task.uuid, 'completed')
print()
else:
print('capture:', capture.uuid, 'Unsafe Content Filtered')
print('task: ', task.uuid, 'Unsafe Content Filtered')
print()
task.remove()
self.root_item = None

def save_capture_response(self, parent_id, entries):
print(entries.keys())
@@ -357,6 +385,12 @@ def save_capture_response(self, parent_id, entries):
print(f'External redirection {self.domain.id} -> {current_domain}')
if not self.root_item:
self.domain = Domain(current_domain)
# Filter Domain
if self.filter_unsafe_onion:
if current_domain.endswith('.onion'):
if not crawlers.check_if_onion_is_safe(current_domain):
return False

# TODO LAST URL
# FIXME
else:
@@ -449,6 +483,7 @@ def save_capture_response(self, parent_id, entries):
if entries_children:
for children in entries_children:
self.save_capture_response(parent_id, children)
return True


if __name__ == '__main__':
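For context, the pre-launch check added to get_message() boils down to the decision flow sketched below: unsafe .onion domains are dropped before a capture is launched, and a lookup failure sends the task back to the queue for a later retry. This is a minimal sketch, assuming an AIL checkout where lib.crawlers and lib.exceptions are importable; the helper name filter_next_task is hypothetical and the surrounding scheduler loop is omitted.

import time

from lib import crawlers
from lib.exceptions import OnionFilteringError

def filter_next_task(task, logger, filter_unsafe_onion):
    # Sketch of the pre-launch filter this commit adds to Crawler.get_message().
    domain = task.get_domain()
    if filter_unsafe_onion and domain.endswith('.onion'):
        try:
            if not crawlers.check_if_onion_is_safe(domain):
                task.delete()   # unsafe onion: discard the queued capture task
                return None
        except OnionFilteringError:
            task.reset()        # lookup service unreachable: send the task back in queue
            logger.warning(f'Onion Filtering Connection Error, {task.uuid} Send back in queue')
            time.sleep(10)
            return None
    return task                 # safe (or filter disabled): caller starts the capture
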
88 changes: 82 additions & 6 deletions bin/lib/crawlers.py
@@ -39,12 +39,14 @@
from packages import git_status
from packages import Date
from lib import ail_orgs
from lib.exceptions import OnionFilteringError
from lib.ConfigLoader import ConfigLoader
from lib.regex_helper import regex_findall
from lib.objects.Domains import Domain
from lib.objects.Titles import Title
from lib.objects import HHHashs
from lib.objects.Items import Item
from lib import Tag

config_loader = ConfigLoader()
r_db = config_loader.get_db_conn("Kvrocks_DB")
@@ -2269,13 +2271,87 @@ def test_ail_crawlers():

#### ---- ####

# TODO CHECK MIGRATION - Rest API

# TODO MIGRATE ME
# def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
# # validate url
# if url is None or url=='' or url=='\n':
# return ({'error':'invalid depth limit'}, 400)
# # # # # # # # # # # # #
# #
# CONTENT FILTERING #
# #
# # # # # # # # # # # # #

def _onion_lookup(onion_url):
try:
commit_id = git_status.get_last_commit_id_from_local()
user_agent = f'AIL-{commit_id}'
headers = {'User-Agent': user_agent}
response = requests.get(f'https://onion.ail-project.org/api/lookup/{onion_url}', timeout=10, headers=headers)
if response.status_code == 200:
json_response = response.json()
return json_response
else:
print(response)
return {'error': f'{response.status_code}'}
except requests.exceptions.ConnectionError:
return {'error': f'Connection Error'}
except requests.exceptions.ReadTimeout:
return {'error': f'Timeout Error'}


def check_if_onion_is_safe(onion_url):
resp = _onion_lookup(onion_url)
if resp:
if isinstance(resp, dict):
if 'tags' in resp:
return Tag.is_tags_safe(resp['tags'])
elif 'error' in resp:
if resp['error']:
raise OnionFilteringError(resp['error'])
return False


def _is_onion_filter_enabled():
enabled = r_crawler.hget('crawler:onion_filter', 'enabled')
if enabled is None:
r_crawler.hset('crawler:onion_filter', 'enabled', str(True))
filter_enabled = True
else:
filter_enabled = enabled == 'True'
r_cache.set('crawler:onion_filter:state', str(filter_enabled))
return filter_enabled

def is_onion_filter_enabled(cache=True):
if cache:
res = r_cache.get('crawler:onion_filter:state')
if res is None:
enabled = _is_onion_filter_enabled()
r_cache.set('crawler:onion_filter:state', str(enabled))
return enabled
else:
return res == 'True'
else:
return _is_onion_filter_enabled()

def get_onion_filter_last_update_time():
last_update_time = r_cache.get('crawler:onion_filter:last_update_time')
if not last_update_time:
last_update_time = r_crawler.hget('crawler:onion_filter', 'update_time')
if not last_update_time:
last_update_time = 0
last_update_time = float(last_update_time)
r_cache.set('crawler:onion_filter:last_update_time', last_update_time)
return float(last_update_time)

def change_onion_filter_state(new_state):
old_state = is_onion_filter_enabled(cache=False)
if old_state != new_state:
r_crawler.hset('crawler:onion_filter', 'enabled', str(new_state))
r_cache.set('crawler:onion_filter:state', str(new_state))
update_time = time.time()
r_crawler.hset('crawler:onion_filter', 'update_time', update_time)
r_cache.set('crawler:onion_filter:last_update_time', update_time)
return True
return False

#### ---- ####


# TODO MOVE ME IN CRAWLER OR FLASK
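The lookup service that check_if_onion_is_safe() relies on can also be exercised on its own. The sketch below is illustrative only: the function name lookup_onion and the default user agent are assumptions (inside AIL the user agent is derived from the last commit id, and the returned tags are evaluated with Tag.is_tags_safe()); the response shape — JSON with a 'tags' list on success, an error string otherwise — is inferred from the code above.

import requests

def lookup_onion(onion_domain, user_agent='AIL-example'):
    # Query the same lookup endpoint used by _onion_lookup() above.
    url = f'https://onion.ail-project.org/api/lookup/{onion_domain}'
    try:
        resp = requests.get(url, timeout=10, headers={'User-Agent': user_agent})
        if resp.status_code == 200:
            return resp.json()          # expected to carry a 'tags' list
        return {'error': str(resp.status_code)}
    except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as exc:
        return {'error': type(exc).__name__}
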
3 changes: 3 additions & 0 deletions bin/lib/exceptions.py
@@ -26,3 +26,6 @@ class MISPConnectionError(AILError):

class AILObjectUnknown(AILError):
pass

class OnionFilteringError(AILError):
pass
17 changes: 17 additions & 0 deletions var/www/blueprints/crawler_splash.py
@@ -996,6 +998,8 @@ def crawler_settings():
is_crawler_working = crawlers.is_test_ail_crawlers_successful()
crawler_error_mess = crawlers.get_test_ail_crawlers_message()

is_onion_filter_enabled = crawlers.is_onion_filter_enabled(cache=False)

# TODO REGISTER PROXY
# all_proxies = crawlers.get_all_proxies_metadata()

@@ -1008,6 +1010,7 @@
# all_proxies=all_proxies,
is_crawler_working=is_crawler_working,
crawler_error_mess=crawler_error_mess,
is_onion_filter_enabled=is_onion_filter_enabled,
)


@@ -1054,4 +1057,18 @@ def crawler_settings_crawler_test():
crawlers.test_ail_crawlers()
return redirect(url_for('crawler_splash.crawler_settings'))

@crawler_splash.route('/crawler/settings/crawler/filter_unsafe_onion', methods=['GET'])
@login_required
@login_admin
def crawler_filter_unsafe_onion():
filter_unsafe_onion = request.args.get('state')
if filter_unsafe_onion == 'enable':
filter_unsafe_onion = True
else:
filter_unsafe_onion = False
print(filter_unsafe_onion)
crawlers.change_onion_filter_state(filter_unsafe_onion)
return redirect(url_for('crawler_splash.crawler_settings'))


# --- LACUS ---#
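The new admin-only route maps ?state=enable to True and anything else to False before delegating to crawlers.change_onion_filter_state(). As a rough illustration of that call path — bypassing Flask and the login_admin check entirely, and assuming an AIL environment with its Kvrocks/Redis back ends running — the toggle could be driven directly from the library; change_onion_filter_state() returns True only when the stored state actually changes.

from lib import crawlers

# Sketch: flip the filter off and back on, mirroring what the
# /crawler/settings/crawler/filter_unsafe_onion route does with ?state=.
changed = crawlers.change_onion_filter_state(False)   # equivalent to ?state=disable
print('state changed:', changed)

changed = crawlers.change_onion_filter_state(True)    # equivalent to ?state=enable
print('state changed:', changed, '| enabled:', crawlers.is_onion_filter_enabled(cache=False))
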
31 changes: 31 additions & 0 deletions var/www/templates/crawler/crawler_splash/settings_crawler.html
@@ -226,6 +226,37 @@ <h5 class="card-title">Crawlers Settings:</h5>
</div>
</div> -->

<div class="card border-secondary my-4">
<div class="card-body text-dark">
<h5 class="card-title">
Filter Unsafe Onion: &nbsp;&nbsp;<b class="text-primary"><span class="text-{% if is_onion_filter_enabled %}success{% else %}danger{% endif %}">{{ is_onion_filter_enabled }}</span></b>
</h5>
<p>
This option enables filtering of onion domains that are considered unsafe due to containing violent content, child sexual abuse material (CSAM), or other harmful materials. When enabled, the system will attempt to identify and exclude such domains from crawling.<br><br>

<span class="text-danger"><i class="fa-solid fa-triangle-exclamation fa-2x"></i></span><b> Disabling this option may result in crawling and downloading content that includes CSAM, extreme violence, or other harmful materials.</b><br> Users are strongly advised to keep this feature enabled to avoid unintentional exposure to such content.<br><br>

🔍 How It Works: The filtering mechanism leverages known blocklists, heuristics, and automated detection techniques to reduce the risk of crawling unsafe content. While no filtering system is perfect, we continuously strive to improve detection and minimize exposure to harmful materials.<br><br>

By using this feature, you benefit from an added layer of protection, but please note that some unsafe onion domains may still bypass detection due to evolving content and obfuscation techniques.<br>
We encourage users to remain cautious and use this feature as an additional safeguard.
</p>
{% if is_onion_filter_enabled %}
<a href="{{ url_for('crawler_splash.crawler_filter_unsafe_onion') }}?state=disable">
<button class="btn btn-danger mx-4 my-2">
<i class="fa-solid fa-xmark"></i> Disable Onion Filter
</button>
</a>
{% else %}
<a href="{{ url_for('crawler_splash.crawler_filter_unsafe_onion') }}?state=enable">
<button class="btn btn-success my-2">
<i class="fa-solid fa-check"></i> Enable Onion Filter
</button>
</a>
{% endif %}
</div>
</div>

<a href="{{ url_for('crawler_splash.crawler_blacklist') }}">
<button type="button" class="btn btn-outline-danger">Blacklisted domains</button>
</a>
