-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweb_scraper.py
157 lines (113 loc) · 4.64 KB
/
web_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import os
from time import sleep

import boto3
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService
def _open_video(url, options):
    """Start Chrome with *options* and load *url*, retrying until both succeed.

    Returns the live driver. The retry loop is unbounded: a failed attempt
    quits the half-initialised browser, waits five seconds and starts over.
    """
    while True:
        driver = None  # reset so a previous, already-quit driver is never reused
        try:
            driver = webdriver.Chrome(options=options)
            driver.get(url)
        except Exception:
            # driver.get does not work all the time, hence the retry.
            # ``Exception`` instead of a bare ``except`` keeps Ctrl-C and
            # SystemExit able to break out of this loop.
            if driver is not None:
                driver.quit()
            sleep(5)
        else:
            return driver


def _scroll_to_bottom(driver, stable_polls=100):
    """Scroll to the page bottom until its height stops changing.

    The page counts as fully loaded once the document height has been
    identical for *stable_polls* consecutive checks.
    """
    remaining = stable_polls
    prev_height = 0
    while True:
        height = driver.execute_script('return document.documentElement.scrollHeight')
        driver.execute_script("window.scrollTo(0, " + str(height) + ");")
        if height == prev_height:
            remaining -= 1
            if remaining == 0:
                break
        else:
            # The page grew, so restart the stability countdown.
            remaining = stable_polls
        prev_height = height


def _dismiss_promo(driver):
    """Click the first button of the promo dialog if it is present."""
    ad_path = "/html/body/ytd-app/ytd-popup-container/tp-yt-paper-dialog/yt-mealbar-promo-renderer/div/div[2]/yt-button-renderer[1]"
    try:
        driver.find_element("xpath", ad_path).click()
    except WebDriverException:
        print("No ad")


def _comment_text(comment):
    """Return the visible text of a comment or reply element.

    The previous implementation tried to iterate over the result of
    ``find_element`` (a single element, not a list), which always raised and
    fell back to the element's ``text``; that effective behaviour is kept
    here explicitly.
    """
    content = comment.find_element("xpath", ".//yt-formatted-string[@id = 'content-text']")
    return content.text


def _collect_comments(driver):
    """Expand every reply thread and return all comment and reply texts."""
    comments_xpath = "/html/body/ytd-app/div[1]/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[2]/ytd-comments/ytd-item-section-renderer/div[3]"
    comments_section = driver.find_element("xpath", comments_xpath)
    comments = comments_section.find_elements("xpath", './ytd-comment-thread-renderer')
    print(len(comments))

    # Press all view replies button to comments
    for comment in comments:
        try:
            comment.find_element("xpath", './/ytd-button-renderer[@id="more-replies"]').click()
        except WebDriverException:
            # No replies button on this thread, or it could not be clicked.
            continue

    # Query again after expanding: this is not the same list as above.
    comments = comments_section.find_elements("xpath", './/ytd-comment-thread-renderer')

    texts = []
    for comment in comments:
        texts.append(_comment_text(comment))
        try:
            replies_section = comment.find_element("xpath", ".//ytd-comment-replies-renderer")
            for reply in replies_section.find_elements("xpath", ".//ytd-comment-renderer"):
                texts.append(_comment_text(reply))
        except WebDriverException:
            # Thread without a replies section (or a reply that could not be
            # read): keep what was gathered so far and move on.
            continue
    return texts


def _upload_csv(filename, key):
    """Upload *filename* under *key* to the last bucket S3 lists for the account."""
    s3 = boto3.resource("s3")
    bucket_name = ''
    for bucket in s3.buckets.all():
        bucket_name = bucket.name
    # NOTE(review): when no bucket is listed, bucket_name stays '' and
    # upload_file is called with an empty bucket name.

    s3_client = boto3.client(
        "s3",
        aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
        aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
        region_name=os.environ.get('AWS_DEFAULT_REGION'),
    )
    s3_client.upload_file(Filename=filename, Bucket=bucket_name, Key=key)


def crawler(link):
    """Scrape the comments of a YouTube video and upload them to S3 as CSV.

    Args:
        link: Video identifier; it is appended to
            ``https://www.youtube.com/watch?v=`` to build the page URL and
            also used (with a ``.csv`` suffix) as the S3 object key.

    Returns:
        Always ``False``.

    Side effects:
        Launches headless Chrome, prints progress information, writes
        ``new_file.csv`` to the current working directory and uploads it to
        S3 using the ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY`` and
        ``AWS_DEFAULT_REGION`` environment variables.
    """
    youtube_link = "https://www.youtube.com/watch?v=" + link

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode (without GUI)
    options.add_argument('--no-sandbox')

    driver = _open_video(youtube_link, options)
    try:
        _scroll_to_bottom(driver)
        _dismiss_promo(driver)
        texts = _collect_comments(driver)
    finally:
        # quit() ends the whole driver session; close() would only close the
        # window, and without ``finally`` a scraping error would leave the
        # browser running.
        driver.quit()

    df = pd.DataFrame({"Comments": texts})
    print(df)

    # Persist locally, then upload the comments in the csv file to our bucket.
    df.to_csv("new_file.csv")
    _upload_csv("new_file.csv", link + ".csv")
    return False