-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathscrap.py
More file actions
74 lines (52 loc) · 2.23 KB
/
scrap.py
File metadata and controls
74 lines (52 loc) · 2.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# import requests
# from bs4 import BeautifulSoup
# # URL of the webpage to scrape
# url = 'https://www.croma.com/unboxed/best-3-star-acs'
# # Send a GET request to the webpage
# response = requests.get(url)
# # Check if the request was successful
# if response.status_code == 200:
# # Parse the HTML content
# soup = BeautifulSoup(response.content, 'html.parser')
# clean_text = BeautifulSoup(str(soup), 'html.parser').get_text(strip=True)
# # Define the tags you want to scrape content from
# # tags = ['a', 'p', 'li', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
# print(clean_text)
# # tags=["p","span","h1","h2","h3"]
# # # Extract and print content from the specified tags
# # for tag in tags:
# # elements = soup.find_all(tag)
# # for element in elements:
# # # Clean the text and print it
# # text = element.get_text(strip=True)
# # if text: # Only print non-empty text
# # print(f"<{tag}>: {text}")
# else:
# print(f"Failed to retrieve webpage. Status code: {response.status_code}")
import requests
from bs4 import BeautifulSoup
import re
# URL of the webpage to scrape
def scrapp(url):
    """Fetch *url* and return the page's visible text as one cleaned string.

    Parameters
    ----------
    url : str
        Address of the webpage to scrape.

    Returns
    -------
    str
        The whitespace-normalized text of the page on success, or a
        human-readable failure message. This function never raises.
    """
    try:
        # Send a single GET request (the original fetched the page twice
        # on the success path). A timeout prevents hanging forever on a
        # dead host; a timeout surfaces as RequestException below.
        response = requests.get(url, timeout=10)
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content with BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')
            # Extract all text content from the page
            text_content = soup.get_text()
            # Collapse runs of whitespace/newlines into single spaces
            cleaned_text = ' '.join(text_content.split())
            return cleaned_text
        else:
            return f"Failed to retrieve webpage. Status code: {response.status_code}"
    except requests.RequestException:
        # Narrow exception instead of a bare `except:`, which would also
        # swallow KeyboardInterrupt/SystemExit and hide real bugs.
        return "Failed to retrieve webpage"
# print(scrapp("https://www.statista.com/statistics/1018500/india-leading-ac-providers-market-share/"))
# # Fetch the page content
# url = 'https://example.com' # Replace with the URL you want to scrape