33"""
44
55import re
6+ import json
67from urllib .parse import urljoin
78
89from bs4 import BeautifulSoup , Comment
910from minify_html import minify
1011
1112
# Start of a JavaScript assignment whose right-hand side opens an object
# literal, e.g. ``const data = {`` or ``window.state = {``. The lookahead
# makes the match end exactly on the opening brace so decoding can start there.
_JSON_ASSIGNMENT_RE = re.compile(r"\w+\s*=\s*(?=\{)")

# ``window.name = value;`` / ``document.name = value;``. ``(?!=)`` keeps
# comparisons such as ``window.x == 1`` from being read as assignments.
_GLOBAL_ASSIGNMENT_RE = re.compile(
    r"(?:window|document)\.(\w+)\s*=(?!=)\s*([^;]+);"
)


def _iter_json_objects(content):
    """Yield each non-empty JSON object assigned to a name in *content*."""
    decoder = json.JSONDecoder()
    pos = 0
    while True:
        match = _JSON_ASSIGNMENT_RE.search(content, pos)
        if match is None:
            return
        try:
            # raw_decode stops at the end of the object, so nested braces and
            # any code following the literal are handled correctly.
            parsed, pos = decoder.raw_decode(content, match.end())
        except (ValueError, RecursionError):
            # Not strict JSON (plain JS literal, too deeply nested, ...):
            # skip this candidate and keep scanning after it.
            pos = match.end()
            continue
        if parsed:
            yield parsed


def extract_from_script_tags(soup):
    """
    Collect data embedded in the ``<script>`` tags of a parsed document.

    Two kinds of data are extracted from every script that has text content:

    * JSON object literals assigned to a variable (``var x = {...}``),
      re-serialised with two-space indentation;
    * values assigned to ``window.<name>`` or ``document.<name>``, up to the
      first semicolon.

    Scripts without text content are ignored, as are object literals that are
    not valid JSON or that are empty.

    Args:
        soup: Parsed document exposing ``find_all("script")``; each returned
            tag must expose its text through a ``string`` attribute.

    Returns:
        str: The extracted entries joined by blank lines, or an empty string
        when nothing was found.
    """
    script_content = []

    for script in soup.find_all("script"):
        content = script.string
        if not content:
            continue

        script_content.extend(
            f"JSON data from script: {json.dumps(parsed, indent=2)}"
            for parsed in _iter_json_objects(content)
        )
        script_content.extend(
            f"Dynamic data - {var_name}: {var_value.strip()}"
            for var_name, var_value in _GLOBAL_ASSIGNMENT_RE.findall(content)
        )

    return "\n\n".join(script_content)
42+
43+
1244def cleanup_html (html_content : str , base_url : str ) -> str :
1345 """
1446 Processes HTML content by removing unnecessary tags,
@@ -34,8 +66,10 @@ def cleanup_html(html_content: str, base_url: str) -> str:
3466
3567 title_tag = soup .find ("title" )
3668 title = title_tag .get_text () if title_tag else ""
37-
38- for tag in soup .find_all (["script" , "style" ]):
69+
70+ script_content = extract_from_script_tags (soup )
71+
72+ for tag in soup .find_all ("style" ):
3973 tag .extract ()
4074
4175 link_urls = [
@@ -54,7 +88,7 @@ def cleanup_html(html_content: str, base_url: str) -> str:
5488 body_content = soup .find ("body" )
5589 if body_content :
5690 minimized_body = minify (str (body_content ))
57- return title , minimized_body , link_urls , image_urls
91+ return title , minimized_body , link_urls , image_urls , script_content
5892
5993 else :
6094 raise ValueError (
@@ -106,10 +140,10 @@ def reduce_html(html, reduction):
106140 for comment in soup .find_all (string = lambda text : isinstance (text , Comment )):
107141 comment .extract ()
108142
109- for tag in soup (["script" , " style" ]):
143+ for tag in soup (["style" ]):
110144 tag .string = ""
111145
112- attrs_to_keep = ["class" , "id" , "href" , "src" ]
146+ attrs_to_keep = ["class" , "id" , "href" , "src" , "type" ]
113147 for tag in soup .find_all (True ):
114148 for attr in list (tag .attrs ):
115149 if attr not in attrs_to_keep :
@@ -118,15 +152,15 @@ def reduce_html(html, reduction):
118152 if reduction == 1 :
119153 return minify_html (str (soup ))
120154
121- for tag in soup (["script" , " style" ]):
155+ for tag in soup (["style" ]):
122156 tag .decompose ()
123157
124158 body = soup .body
125159 if not body :
126160 return "No <body> tag found in the HTML"
127161
128162 for tag in body .find_all (string = True ):
129- if tag .parent .name not in ["script" , "style" ]:
163+ if tag .parent .name not in ["script" ]:
130164 tag .replace_with (re .sub (r"\s+" , " " , tag .strip ())[:20 ])
131165
132166 reduced_html = str (body )
0 commit comments