From 18500189579eea8f45a29a8de3dccca10062b3d7 Mon Sep 17 00:00:00 2001 From: Digvijay Narayan Date: Mon, 22 Apr 2024 01:13:56 +0530 Subject: [PATCH 01/13] added staff model and page --- pesuacademy/models/staff.py | 27 ++++++++ pesuacademy/pages/staff.py | 126 ++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 pesuacademy/models/staff.py create mode 100644 pesuacademy/pages/staff.py diff --git a/pesuacademy/models/staff.py b/pesuacademy/models/staff.py new file mode 100644 index 0000000..0c642f5 --- /dev/null +++ b/pesuacademy/models/staff.py @@ -0,0 +1,27 @@ +class Staff: + def __init__( + self, + name: str, + designation: str, + education: list, + experience: list, + campus: str, + department: str, + domains: list, + Responsibilities: list, + mail : str + ): + self.name = name + self.designation = designation + self.education = education + self.experience = experience + self.department = department + self.campus = campus + self.domains = domains + self.Responsibilities = Responsibilities + self.mail = mail + + def __str__(self): + return f"{self.__dict__}" + + diff --git a/pesuacademy/pages/staff.py b/pesuacademy/pages/staff.py new file mode 100644 index 0000000..78ad391 --- /dev/null +++ b/pesuacademy/pages/staff.py @@ -0,0 +1,126 @@ +import time +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +import requests +from bs4 import BeautifulSoup +from ..models.staff import Staff + + +class StaffPageHandler: + @staticmethod + def get_staff_details() -> Staff: + try: + base_url = "https://staff.pes.edu/atoz/" + options = Options() + # options.add_argument("--disable-infobars") + options.add_argument("--headless") + driver = webdriver.Chrome() + for page_num in range(1, 23): + staff_url = f"{base_url}?page={page_num}" + response = requests.get(staff_url) + soup=BeautifulSoup(response.text,"html.parser") + staff_divs = soup.find_all('div', class_='staff-profile') + for staff_div in staff_divs: + anchor_tag = staff_div.find('a', class_='geodir-category-img_item') + if anchor_tag: + base_url_single_staff="https://staff.pes.edu/" + staff_url = anchor_tag['href'] + request_path = base_url_single_staff + staff_url[1:] + driver.get(request_path) + # time.sleep(3) + html = driver.page_source + soup = BeautifulSoup(html, 'html.parser') + PESU_STAFF=StaffPageHandler.get_details_from_url(request_path, driver) + print(PESU_STAFF) + # return PESU_STAFF + + + except Exception as e: + print(f"Error occurred: {e}") + raise ConnectionError("Unable to fetch staff data.") + finally: + driver.quit() + + @staticmethod + def get_details_from_url(url, driver): + driver.get(url) + time.sleep(3) + + html = driver.page_source + soup = BeautifulSoup(html, 'html.parser') + #name + name_tag = soup.find('h4') + name = name_tag.text.strip() if name_tag else None + #domain + teaching_items = soup.select('#tab-teaching .bookings-item-content ul.ul-item-left li') + domains = [item.text.strip() for item in teaching_items] + #designation + designation=soup.find('h5') + designation = ' '.join(designation.text.split()) + #Education + professor_education = [] + education_section = soup.find('h3', string='Education') + if education_section: + education_list = education_section.find_next('ul', class_='ul-item-left').find_all('li') + education_details = [item.find('p').text.strip() for item in education_list] + for detail in education_details: + professor_education.append(detail) + # print(professor_education) + # print() + #Experience + professor_experience=[] + experience_section = soup.find('h3', string='Experience') + if experience_section: + experience_list = experience_section.find_next('ul', class_='ul-item-left').find_all('li') + experience_details = [item.find('p').text.strip() for item in experience_list] + for detail in experience_details: + professor_experience.append(detail) + # print(professor_experience) + # print() + + + #email + all_a_tags = soup.find_all("a") + email = [ + tag for tag in all_a_tags + if "pes.edu" in tag.get("href", "") and "pes.edu" in tag.get_text() + ] + email=email[0].get_text() + + #department + department_element = soup.find('li', class_='contat-card') + department_paragraph = department_element.find('p') + department = department_paragraph.get_text(strip=True) + + #campus + campus_element=soup.find_all('li', class_='contat-card')[1] + campus_paragraph = campus_element.find('p') + campus=campus_paragraph.get_text(strip=True) + + + #responsibilities + responsibilities=[] + responsibilities_div=soup.find_all('div',class_="bookings-item-content fl-wrap")[3] + responsibilities_ul = responsibilities_div.findChild() + if responsibilities_ul: + responsibilities_li_elements=responsibilities_ul.find_all('li') + for li in responsibilities_li_elements: + responsibilities_paragraph=li.find('p') + responsibilities.append(responsibilities_paragraph.get_text(strip=True)) + + Pesu_Staff=Staff(name,designation,professor_education,professor_experience,campus,department,domains,responsibilities,email) + # Pesu_Staff.name=name + # Pesu_Staff.designation=designation + # Pesu_Staff.domains=domains + # Pesu_Staff.education=professor_education + # Pesu_Staff.experience=professor_experience + # Pesu_Staff.department=department + # Pesu_Staff.email=email + # pesu_staff.campus=campus + # Pesu_Staff.responsibilities=responsibilities + return Pesu_Staff + + From 8bf18ac2faec03219d1c491e7dd86de47e21b4c6 Mon Sep 17 00:00:00 2001 From: Digvijay Narayan Date: Mon, 22 Apr 2024 03:57:03 +0530 Subject: [PATCH 02/13] refactored according acc to prev review --- pesuacademy/models/staff.py | 18 +++++----- pesuacademy/pages/staff.py | 72 ++++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 36 deletions(-) diff --git a/pesuacademy/models/staff.py b/pesuacademy/models/staff.py index 0c642f5..3874b04 100644 --- a/pesuacademy/models/staff.py +++ b/pesuacademy/models/staff.py @@ -1,15 +1,19 @@ +from typing import Optional + + + class Staff: def __init__( self, name: str, designation: str, - education: list, - experience: list, campus: str, department: str, - domains: list, - Responsibilities: list, - mail : str + mail : str, + domains: Optional[list]=None, + responsibilities: Optional[list]=None, + education: Optional[list]=None, + experience: Optional[list]=None, ): self.name = name self.designation = designation @@ -18,10 +22,8 @@ def __init__( self.department = department self.campus = campus self.domains = domains - self.Responsibilities = Responsibilities self.mail = mail + self.responsibilities = responsibilities def __str__(self): return f"{self.__dict__}" - - diff --git a/pesuacademy/pages/staff.py b/pesuacademy/pages/staff.py index 78ad391..6bdafbb 100644 --- a/pesuacademy/pages/staff.py +++ b/pesuacademy/pages/staff.py @@ -1,26 +1,29 @@ import time -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC +# from selenium import webdriver +# from selenium.webdriver.chrome.options import Options +# from selenium.webdriver.common.by import By +# from selenium.webdriver.support.ui import WebDriverWait +# from selenium.webdriver.support import expected_conditions as EC +from requests_html import HTMLSession import requests from bs4 import BeautifulSoup from ..models.staff import Staff - class StaffPageHandler: @staticmethod def get_staff_details() -> Staff: try: base_url = "https://staff.pes.edu/atoz/" - options = Options() + session=HTMLSession() + # options = Options() # options.add_argument("--disable-infobars") - options.add_argument("--headless") - driver = webdriver.Chrome() + # options.add_argument("--headless") + # driver = webdriver.Chrome() for page_num in range(1, 23): staff_url = f"{base_url}?page={page_num}" - response = requests.get(staff_url) + response = session.get(staff_url) + if response.status_code != 200: + raise ConnectionError(f"Failed to fetch URL: {staff_url}") soup=BeautifulSoup(response.text,"html.parser") staff_divs = soup.find_all('div', class_='staff-profile') for staff_div in staff_divs: @@ -29,11 +32,12 @@ def get_staff_details() -> Staff: base_url_single_staff="https://staff.pes.edu/" staff_url = anchor_tag['href'] request_path = base_url_single_staff + staff_url[1:] - driver.get(request_path) + # driver.get(request_path) # time.sleep(3) - html = driver.page_source - soup = BeautifulSoup(html, 'html.parser') - PESU_STAFF=StaffPageHandler.get_details_from_url(request_path, driver) + # html = driver.page_source + # soup = BeautifulSoup(html, 'html.parser') + # StaffPageHandler.get_details_from_url(request_path, session) + PESU_STAFF=StaffPageHandler.get_details_from_url(request_path, session) print(PESU_STAFF) # return PESU_STAFF @@ -42,15 +46,18 @@ def get_staff_details() -> Staff: print(f"Error occurred: {e}") raise ConnectionError("Unable to fetch staff data.") finally: - driver.quit() - + session.close() @staticmethod def get_details_from_url(url, driver): - driver.get(url) - time.sleep(3) + # driver.get(url) + # time.sleep(3) - html = driver.page_source - soup = BeautifulSoup(html, 'html.parser') + # html = driver.page_source + session=HTMLSession() + response=session.get(url) + if response.status_code != 200: + raise ConnectionError(f"Failed to fetch URL: {url}") + soup = BeautifulSoup(response.text, 'html.parser') #name name_tag = soup.find('h4') name = name_tag.text.strip() if name_tag else None @@ -103,14 +110,17 @@ def get_details_from_url(url, driver): #responsibilities responsibilities=[] - responsibilities_div=soup.find_all('div',class_="bookings-item-content fl-wrap")[3] - responsibilities_ul = responsibilities_div.findChild() - if responsibilities_ul: - responsibilities_li_elements=responsibilities_ul.find_all('li') - for li in responsibilities_li_elements: - responsibilities_paragraph=li.find('p') - responsibilities.append(responsibilities_paragraph.get_text(strip=True)) - + + responsibilities_div = soup.find('div', id='tab-responsibilities') + if(responsibilities_div is not None): + # print(len(responsibilities_div)) + # print(responsibilities_div) + p_tags = responsibilities_div.find_all('p') + responsibilities = [p.text for p in p_tags] + + # print(responsibilities) + # print() + Pesu_Staff=Staff(name,designation,professor_education,professor_experience,campus,department,domains,responsibilities,email) # Pesu_Staff.name=name # Pesu_Staff.designation=designation @@ -124,3 +134,9 @@ def get_details_from_url(url, driver): return Pesu_Staff +def main(): + StaffPageHandler.get_staff_details() + + +if __name__ == "__main__": + main() From 51cab00416c6c7a2568f95cba164c83af711c9bb Mon Sep 17 00:00:00 2001 From: Digvijay Narayan Date: Mon, 22 Apr 2024 03:57:46 +0530 Subject: [PATCH 03/13] removed main function in pages/staff.py --- pesuacademy/pages/staff.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pesuacademy/pages/staff.py b/pesuacademy/pages/staff.py index 6bdafbb..d905dfc 100644 --- a/pesuacademy/pages/staff.py +++ b/pesuacademy/pages/staff.py @@ -134,9 +134,3 @@ def get_details_from_url(url, driver): return Pesu_Staff -def main(): - StaffPageHandler.get_staff_details() - - -if __name__ == "__main__": - main() From df00203f1d3fad522ec98058fc4257a18d42a0b4 Mon Sep 17 00:00:00 2001 From: Digvijay Narayan Date: Tue, 23 Apr 2024 23:30:50 +0530 Subject: [PATCH 04/13] used only one session,dynamically getting the page number,function to get staff by dept --- pesuacademy/models/staff.py | 13 +- pesuacademy/pages/staff.py | 234 +++++++++++++++++++++--------------- 2 files changed, 141 insertions(+), 106 deletions(-) diff --git a/pesuacademy/models/staff.py b/pesuacademy/models/staff.py index 3874b04..b08a190 100644 --- a/pesuacademy/models/staff.py +++ b/pesuacademy/models/staff.py @@ -1,19 +1,18 @@ from typing import Optional - class Staff: def __init__( self, name: str, - designation: str, + designation: str, campus: str, department: str, - mail : str, - domains: Optional[list]=None, - responsibilities: Optional[list]=None, - education: Optional[list]=None, - experience: Optional[list]=None, + mail: str, + domains: Optional[list] = None, + responsibilities: Optional[list] = None, + education: Optional[list] = None, + experience: Optional[list] = None, ): self.name = name self.designation = designation diff --git a/pesuacademy/pages/staff.py b/pesuacademy/pages/staff.py index d905dfc..6bdb619 100644 --- a/pesuacademy/pages/staff.py +++ b/pesuacademy/pages/staff.py @@ -1,136 +1,172 @@ -import time -# from selenium import webdriver -# from selenium.webdriver.chrome.options import Options -# from selenium.webdriver.common.by import By -# from selenium.webdriver.support.ui import WebDriverWait -# from selenium.webdriver.support import expected_conditions as EC from requests_html import HTMLSession import requests from bs4 import BeautifulSoup -from ..models.staff import Staff +from ..models.staff import Staff + class StaffPageHandler: @staticmethod - def get_staff_details() -> Staff: + def get_staff_details() -> list: try: base_url = "https://staff.pes.edu/atoz/" - session=HTMLSession() - # options = Options() - # options.add_argument("--disable-infobars") - # options.add_argument("--headless") - # driver = webdriver.Chrome() - for page_num in range(1, 23): + session = HTMLSession() + response = session.get(base_url) + if response.status_code != 200: + raise ConnectionError(f"Failed to fetch URL: {base_url}") + + soup = BeautifulSoup(response.text, "html.parser") + last_page_span = soup.find( + "span", {"aria-hidden": "true"} + ) # getting the last page from the pagination end + last_page_number = int(last_page_span.get_text()) + PESU_STAFF_LIST = [] + for page_num in range(1, last_page_number + 1): + print("Scraping page:", page_num) staff_url = f"{base_url}?page={page_num}" - response = session.get(staff_url) - if response.status_code != 200: - raise ConnectionError(f"Failed to fetch URL: {staff_url}") - soup=BeautifulSoup(response.text,"html.parser") - staff_divs = soup.find_all('div', class_='staff-profile') + response = session.get(staff_url) + soup = BeautifulSoup(response.text, "html.parser") + + staff_divs = soup.find_all("div", class_="staff-profile") for staff_div in staff_divs: - anchor_tag = staff_div.find('a', class_='geodir-category-img_item') + anchor_tag = staff_div.find("a", class_="geodir-category-img_item") if anchor_tag: - base_url_single_staff="https://staff.pes.edu/" - staff_url = anchor_tag['href'] + base_url_single_staff = "https://staff.pes.edu/" + staff_url = anchor_tag["href"] request_path = base_url_single_staff + staff_url[1:] - # driver.get(request_path) - # time.sleep(3) - # html = driver.page_source - # soup = BeautifulSoup(html, 'html.parser') - # StaffPageHandler.get_details_from_url(request_path, session) - PESU_STAFF=StaffPageHandler.get_details_from_url(request_path, session) - print(PESU_STAFF) - # return PESU_STAFF + PESU_STAFF = StaffPageHandler.get_details_from_url( + request_path, session + ) + PESU_STAFF_LIST.append(PESU_STAFF) + return PESU_STAFF_LIST except Exception as e: print(f"Error occurred: {e}") raise ConnectionError("Unable to fetch staff data.") finally: session.close() - @staticmethod - def get_details_from_url(url, driver): - # driver.get(url) - # time.sleep(3) - # html = driver.page_source - session=HTMLSession() - response=session.get(url) + @staticmethod + def get_details_from_url(url, session): + response = session.get(url) if response.status_code != 200: raise ConnectionError(f"Failed to fetch URL: {url}") - soup = BeautifulSoup(response.text, 'html.parser') - #name - name_tag = soup.find('h4') + soup = BeautifulSoup(response.text, "html.parser") + # name + name_tag = soup.find("h4") name = name_tag.text.strip() if name_tag else None - #domain - teaching_items = soup.select('#tab-teaching .bookings-item-content ul.ul-item-left li') + # domain + teaching_items = soup.select( + "#tab-teaching .bookings-item-content ul.ul-item-left li" + ) domains = [item.text.strip() for item in teaching_items] - #designation - designation=soup.find('h5') - designation = ' '.join(designation.text.split()) - #Education + # designation + designation = soup.find("h5") + designation = " ".join(designation.text.split()) + # Education professor_education = [] - education_section = soup.find('h3', string='Education') - if education_section: - education_list = education_section.find_next('ul', class_='ul-item-left').find_all('li') - education_details = [item.find('p').text.strip() for item in education_list] - for detail in education_details: - professor_education.append(detail) - # print(professor_education) - # print() - #Experience - professor_experience=[] - experience_section = soup.find('h3', string='Experience') - if experience_section: - experience_list = experience_section.find_next('ul', class_='ul-item-left').find_all('li') - experience_details = [item.find('p').text.strip() for item in experience_list] - for detail in experience_details: - professor_experience.append(detail) - # print(professor_experience) - # print() - - - #email + education_section = soup.find_all("h3") + education_section_filter = [ + h3 for h3 in education_section if h3.get_text(strip=True) == "Education" + ] + + for h3 in education_section_filter: + education_list = h3.find_next("ul", class_="ul-item-left") + if education_list: + education_items = education_list.find_all("li") + education_details = [ + item.find("p").text.strip() for item in education_items + ] + for detail in education_details: + professor_education.append(detail) + + # print(professor_education) + + # Experience + professor_experience = [] + experience_section = soup.find_all("h3") + experience_section_filter = [ + h3 for h3 in experience_section if h3.get_text(strip=True) == "Experience" + ] + for h3 in experience_section_filter: + experience_list = h3.find_next("ul", class_="ul-item-left") + if experience_list: + experience_items = experience_list.find_all("li") + experience_details = [ + item.find("p").text.strip() for item in experience_items + ] + for detail in experience_details: + professor_experience.append(detail) + + # print(professor_experience) + + # email all_a_tags = soup.find_all("a") email = [ - tag for tag in all_a_tags + tag + for tag in all_a_tags if "pes.edu" in tag.get("href", "") and "pes.edu" in tag.get_text() ] - email=email[0].get_text() - - #department - department_element = soup.find('li', class_='contat-card') - department_paragraph = department_element.find('p') + if email: + email = email[0].get_text() + # department + department_element = soup.find("li", class_="contat-card") + department_paragraph = department_element.find("p") department = department_paragraph.get_text(strip=True) + # campus + try: + campus_element = soup.find_all("li", class_="contat-card")[1] + if campus_element: + campus_paragraph = campus_element.find("p") + campus = campus_paragraph.get_text(strip=True) + except IndexError: + campus = None + # responsibilities + responsibilities = [] + responsibilities_div = soup.find("div", id="tab-responsibilities") + if responsibilities_div is not None: + p_tags = responsibilities_div.find_all("p") + responsibilities = [p.text for p in p_tags] + Pesu_Staff = Staff( + name=name, + designation=designation, + education=professor_education, + experience=professor_experience, + department=department, + campus=campus, + domains=domains, + mail=email, + responsibilities=responsibilities, + ) + return Pesu_Staff - #campus - campus_element=soup.find_all('li', class_='contat-card')[1] - campus_paragraph = campus_element.find('p') - campus=campus_paragraph.get_text(strip=True) + @staticmethod + def get_staff(department=None, designation=None): + all_staff = StaffPageHandler.get_staff_details() + print(all_staff) + filtered_staff = all_staff + if department: + # Filter staff by department + filtered_staff = [ + staff for staff in filtered_staff if staff.department == department + ] - #responsibilities - responsibilities=[] + if designation: + # Filter staff by designation + filtered_staff = [ + staff for staff in filtered_staff if staff.designation == designation + ] - responsibilities_div = soup.find('div', id='tab-responsibilities') - if(responsibilities_div is not None): - # print(len(responsibilities_div)) - # print(responsibilities_div) - p_tags = responsibilities_div.find_all('p') - responsibilities = [p.text for p in p_tags] + return filtered_staff - # print(responsibilities) - # print() - - Pesu_Staff=Staff(name,designation,professor_education,professor_experience,campus,department,domains,responsibilities,email) - # Pesu_Staff.name=name - # Pesu_Staff.designation=designation - # Pesu_Staff.domains=domains - # Pesu_Staff.education=professor_education - # Pesu_Staff.experience=professor_experience - # Pesu_Staff.department=department - # Pesu_Staff.email=email - # pesu_staff.campus=campus - # Pesu_Staff.responsibilities=responsibilities - return Pesu_Staff + +# def main(): +# #usage +# cse_staff = StaffPageHandler.get_staff(department="Computer Science") +# for staff_member in cse_staff: +# print(staff_member.name) +# if __name__ == "__main__": +# main() From 3a061e5ca064122723bf17cec4fd6dca1cd50a83 Mon Sep 17 00:00:00 2001 From: aditeyabaral Date: Tue, 30 Apr 2024 06:16:41 +0000 Subject: [PATCH 05/13] Minor refactor --- pesuacademy/models/__init__.py | 1 + pesuacademy/models/{staff.py => professor.py} | 24 ++++++------ pesuacademy/pages/__init__.py | 1 + pesuacademy/pages/{staff.py => faculty.py} | 38 ++++++++----------- pesuacademy/pesuacademy.py | 18 ++++++++- pesuacademy/util/page.py | 11 ++++++ 6 files changed, 58 insertions(+), 35 deletions(-) rename pesuacademy/models/{staff.py => professor.py} (50%) rename pesuacademy/pages/{staff.py => faculty.py} (90%) diff --git a/pesuacademy/models/__init__.py b/pesuacademy/models/__init__.py index aff3c33..cc644b7 100644 --- a/pesuacademy/models/__init__.py +++ b/pesuacademy/models/__init__.py @@ -9,3 +9,4 @@ AddressDetails, QualifyingExamination, ) +from .professor import Professor diff --git a/pesuacademy/models/staff.py b/pesuacademy/models/professor.py similarity index 50% rename from pesuacademy/models/staff.py rename to pesuacademy/models/professor.py index b08a190..0e83932 100644 --- a/pesuacademy/models/staff.py +++ b/pesuacademy/models/professor.py @@ -1,18 +1,18 @@ from typing import Optional -class Staff: +class Professor: def __init__( - self, - name: str, - designation: str, - campus: str, - department: str, - mail: str, - domains: Optional[list] = None, - responsibilities: Optional[list] = None, - education: Optional[list] = None, - experience: Optional[list] = None, + self, + name: str, + designation: str, + campus: str, + department: str, + email: str, + domains: Optional[list] = None, + responsibilities: Optional[list] = None, + education: Optional[list] = None, + experience: Optional[list] = None, ): self.name = name self.designation = designation @@ -21,7 +21,7 @@ def __init__( self.department = department self.campus = campus self.domains = domains - self.mail = mail + self.email = email self.responsibilities = responsibilities def __str__(self): diff --git a/pesuacademy/pages/__init__.py b/pesuacademy/pages/__init__.py index c667d39..8b75fab 100644 --- a/pesuacademy/pages/__init__.py +++ b/pesuacademy/pages/__init__.py @@ -1,3 +1,4 @@ from .attendance import AttendancePageHandler from .courses import CoursesPageHandler from .profile import ProfilePageHandler +from .faculty import FacultyPageHandler diff --git a/pesuacademy/pages/staff.py b/pesuacademy/pages/faculty.py similarity index 90% rename from pesuacademy/pages/staff.py rename to pesuacademy/pages/faculty.py index 6bdb619..c535041 100644 --- a/pesuacademy/pages/staff.py +++ b/pesuacademy/pages/faculty.py @@ -1,12 +1,12 @@ -from requests_html import HTMLSession -import requests from bs4 import BeautifulSoup -from ..models.staff import Staff +import requests_html +from typing import Optional +from pesuacademy.models.professor import Professor -class StaffPageHandler: +class FacultyPageHandler: @staticmethod - def get_staff_details() -> list: + def get_staff_details() -> list[Professor]: try: base_url = "https://staff.pes.edu/atoz/" session = HTMLSession() @@ -127,7 +127,7 @@ def get_details_from_url(url, session): if responsibilities_div is not None: p_tags = responsibilities_div.find_all("p") responsibilities = [p.text for p in p_tags] - Pesu_Staff = Staff( + Pesu_Staff = Professor( name=name, designation=designation, education=professor_education, @@ -135,15 +135,20 @@ def get_details_from_url(url, session): department=department, campus=campus, domains=domains, - mail=email, + email=email, responsibilities=responsibilities, ) return Pesu_Staff - @staticmethod - def get_staff(department=None, designation=None): - all_staff = StaffPageHandler.get_staff_details() - print(all_staff) + def get_page( + self, + session: requests_html.HTMLSession, + department: Optional[str] = None, + designation: Optional[str] = None, + campus: Optional[str] = None, + ) -> list[Professor]: + # TODO: Refactor this to use specific URLs: https://staff.pes.edu/rr/atoz/computer-science/ + all_staff = self.get_staff_details() filtered_staff = all_staff if department: @@ -159,14 +164,3 @@ def get_staff(department=None, designation=None): ] return filtered_staff - - -# def main(): -# #usage -# cse_staff = StaffPageHandler.get_staff(department="Computer Science") -# for staff_member in cse_staff: -# print(staff_member.name) - - -# if __name__ == "__main__": -# main() diff --git a/pesuacademy/pesuacademy.py b/pesuacademy/pesuacademy.py index 9032018..ca4907d 100644 --- a/pesuacademy/pesuacademy.py +++ b/pesuacademy/pesuacademy.py @@ -6,7 +6,7 @@ from pesuacademy import util from pesuacademy.util.page import PageHandler from .exceptions import CSRFTokenError, AuthenticationError -from .models import Profile, ClassAndSectionInfo, Course +from .models import Profile, ClassAndSectionInfo, Course, Professor class PESUAcademy: @@ -155,3 +155,19 @@ def attendance(self, semester: Optional[int] = None) -> dict[int, list[Course]]: raise AuthenticationError("You need to authenticate first.") attendance_info = self.page_handler.get_attendance(semester) return attendance_info + + def faculty( + self, + department: Optional[str] = None, + designation: Optional[str] = None, + campus: Optional[str] = None, + ) -> list[Professor]: + """ + Get the faculty information of the university. + + :param department: The department name. + :param designation: The designation of the faculty. + :return: The faculty information. + """ + faculty_info = self.page_handler.get_faculty(department, designation, campus) + return faculty_info diff --git a/pesuacademy/util/page.py b/pesuacademy/util/page.py index c46018a..c8cf1d7 100644 --- a/pesuacademy/util/page.py +++ b/pesuacademy/util/page.py @@ -14,6 +14,7 @@ def __init__(self, session: requests_html.HTMLSession): self.course_page_handler = pages.CoursesPageHandler() self.attendance_page_handler = pages.AttendancePageHandler() self.profile_page_handler = pages.ProfilePageHandler() + self.faculty_page_handler = pages.FacultyPageHandler() def set_semester_id_to_number_mapping(self, csrf_token: str): try: @@ -80,3 +81,13 @@ def get_courses(self, semester: Optional[int] = None): def get_attendance(self, semester: Optional[int] = None): semester_ids = self.get_semester_ids_from_semester_number(semester) return self.attendance_page_handler.get_page(self.__session, semester_ids) + + def get_faculty( + self, + department: Optional[str] = None, + designation: Optional[str] = None, + campus: Optional[str] = None, + ): + return self.faculty_page_handler.get_page( + self.__session, department, designation, campus + ) From d8a7bd39ac3e5ee36a475218c69ebbe65d0059e1 Mon Sep 17 00:00:00 2001 From: aditeyabaral Date: Tue, 30 Apr 2024 07:51:24 +0000 Subject: [PATCH 06/13] Reformat code --- pesuacademy/models/professor.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pesuacademy/models/professor.py b/pesuacademy/models/professor.py index 0e83932..6644088 100644 --- a/pesuacademy/models/professor.py +++ b/pesuacademy/models/professor.py @@ -3,16 +3,16 @@ class Professor: def __init__( - self, - name: str, - designation: str, - campus: str, - department: str, - email: str, - domains: Optional[list] = None, - responsibilities: Optional[list] = None, - education: Optional[list] = None, - experience: Optional[list] = None, + self, + name: str, + designation: str, + campus: str, + department: str, + email: str, + domains: Optional[list] = None, + responsibilities: Optional[list] = None, + education: Optional[list] = None, + experience: Optional[list] = None, ): self.name = name self.designation = designation From f7b57b7f26063a2fdd8ddd8789c6904bd98a90fa Mon Sep 17 00:00:00 2001 From: aditeyabaral Date: Tue, 30 Apr 2024 07:53:05 +0000 Subject: [PATCH 07/13] Reformat code --- pesuacademy/pesuacademy.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pesuacademy/pesuacademy.py b/pesuacademy/pesuacademy.py index 4604d04..8e83130 100644 --- a/pesuacademy/pesuacademy.py +++ b/pesuacademy/pesuacademy.py @@ -4,10 +4,16 @@ from bs4 import BeautifulSoup from pesuacademy import util -from pesuacademy.models.seating_information import SeatingInformation from pesuacademy.util.page import PageHandler from .exceptions import CSRFTokenError, AuthenticationError -from .models import Profile, ClassAndSectionInfo, Course, Announcement, Professor +from .models import ( + Profile, + ClassAndSectionInfo, + Course, + Announcement, + Professor, + SeatingInformation, +) class PESUAcademy: From 9ad3b5793c28465ef26fc0d221d2c656140ba726 Mon Sep 17 00:00:00 2001 From: aditeyabaral Date: Tue, 30 Apr 2024 19:06:37 +0000 Subject: [PATCH 08/13] Minor changes --- pesuacademy/pages/faculty.py | 82 ++++++++++++++++++++++++++++-------- pesuacademy/pesuacademy.py | 5 ++- pesuacademy/util/page.py | 7 ++- 3 files changed, 72 insertions(+), 22 deletions(-) diff --git a/pesuacademy/pages/faculty.py b/pesuacademy/pages/faculty.py index c535041..3489db0 100644 --- a/pesuacademy/pages/faculty.py +++ b/pesuacademy/pages/faculty.py @@ -5,6 +5,63 @@ class FacultyPageHandler: + departments = { + "arch": "architecture", + "bt": "biotechnology", + "cv": "civil", + "cse": "computer-science", + "cse-aiml": "computer-science-AIML", + "ca": "computer-application", + "des": "design", + "eee": "electrical-&-electronics", + "ece": "electronics-&-communications", + "law": "law", + "me": "mechanical", + "ms": "management-studies", + "sh": "science-&-humanities", + "com": "commerce", + "psy": "psychology", + "cie": "centre-for-innovation-&-entrepreneurship", + "ps": "pharmaceutical-sciences", + } + campuses = ["rr", "ec", "hn"] + + @staticmethod + def get_urls_from_campus_and_department( + campus: Optional[str], department: Optional[str] + ): + base_url = "https://staff.pes.edu/{campus}/atoz/{department}" + if department: + assert ( + department in FacultyPageHandler.departments + ), "Invalid department provided." + if campus: + assert campus in FacultyPageHandler.campuses, "Invalid campus provided." + + if not department and not campus: + urls = [base_url.format(campus="", department="")] + elif department and not campus: + urls = [ + base_url.format( + campus=campus, department=FacultyPageHandler.departments[department] + ) + for campus in ["rr", "ec", "hn"] + ] + elif campus and not department: + urls = [ + base_url.format( + campus=campus, department=FacultyPageHandler.departments[department] + ) + for department in FacultyPageHandler.departments + ] + else: + urls = [ + base_url.format( + campus=campus, department=FacultyPageHandler.departments[department] + ) + ] + return urls + @staticmethod def get_staff_details() -> list[Professor]: try: @@ -143,24 +200,13 @@ def get_details_from_url(url, session): def get_page( self, session: requests_html.HTMLSession, + campus: Optional[str] = None, department: Optional[str] = None, designation: Optional[str] = None, - campus: Optional[str] = None, ) -> list[Professor]: - # TODO: Refactor this to use specific URLs: https://staff.pes.edu/rr/atoz/computer-science/ - all_staff = self.get_staff_details() - filtered_staff = all_staff - - if department: - # Filter staff by department - filtered_staff = [ - staff for staff in filtered_staff if staff.department == department - ] - - if designation: - # Filter staff by designation - filtered_staff = [ - staff for staff in filtered_staff if staff.designation == designation - ] - - return filtered_staff + urls = self.get_urls_from_campus_and_department(campus, department) + # TODO: Scrape the data from the URLs. Use the same session object provided. + # professors = list() + # for url in urls: + # professors.extend(get_faculty(session, url)) + # return professors diff --git a/pesuacademy/pesuacademy.py b/pesuacademy/pesuacademy.py index 8e83130..2b47a97 100644 --- a/pesuacademy/pesuacademy.py +++ b/pesuacademy/pesuacademy.py @@ -165,18 +165,19 @@ def attendance(self, semester: Optional[int] = None) -> dict[int, list[Course]]: def faculty( self, + campus: Optional[str] = None, department: Optional[str] = None, designation: Optional[str] = None, - campus: Optional[str] = None, ) -> list[Professor]: """ Get the faculty information of the university. + :param campus: The campus name. :param department: The department name. :param designation: The designation of the faculty. :return: The faculty information. """ - faculty_info = self.page_handler.get_faculty(department, designation, campus) + faculty_info = self.page_handler.get_faculty(campus, department, designation) return faculty_info def seating_information(self) -> list[SeatingInformation]: diff --git a/pesuacademy/util/page.py b/pesuacademy/util/page.py index c396e9e..8cef248 100644 --- a/pesuacademy/util/page.py +++ b/pesuacademy/util/page.py @@ -85,12 +85,15 @@ def get_attendance(self, semester: Optional[int] = None): def get_faculty( self, + campus: Optional[str] = None, department: Optional[str] = None, designation: Optional[str] = None, - campus: Optional[str] = None, ): return self.faculty_page_handler.get_page( - self.__session, department, designation, campus + self.__session, + campus, + department, + designation, ) def get_seating_information(self): From fc7c539cb04c1a8f14e034918a0090ffd91749c9 Mon Sep 17 00:00:00 2001 From: aditeyabaral Date: Tue, 30 Apr 2024 19:10:09 +0000 Subject: [PATCH 09/13] Fix unspecified named paths --- pesuacademy/pages/faculty.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pesuacademy/pages/faculty.py b/pesuacademy/pages/faculty.py index 3489db0..c17dba2 100644 --- a/pesuacademy/pages/faculty.py +++ b/pesuacademy/pages/faculty.py @@ -66,7 +66,7 @@ def get_urls_from_campus_and_department( def get_staff_details() -> list[Professor]: try: base_url = "https://staff.pes.edu/atoz/" - session = HTMLSession() + session = requests_html.HTMLSession() response = session.get(base_url) if response.status_code != 200: raise ConnectionError(f"Failed to fetch URL: {base_url}") @@ -90,7 +90,7 @@ def get_staff_details() -> list[Professor]: base_url_single_staff = "https://staff.pes.edu/" staff_url = anchor_tag["href"] request_path = base_url_single_staff + staff_url[1:] - PESU_STAFF = StaffPageHandler.get_details_from_url( + PESU_STAFF = FacultyPageHandler.get_details_from_url( request_path, session ) PESU_STAFF_LIST.append(PESU_STAFF) From 08995e95bb50957fbc748a8b8d0d1bab958a3481 Mon Sep 17 00:00:00 2001 From: aditeyabaral Date: Fri, 3 May 2024 18:17:18 +0000 Subject: [PATCH 10/13] Reformat code --- pesuacademy/pages/faculty.py | 95 ++++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/pesuacademy/pages/faculty.py b/pesuacademy/pages/faculty.py index c17dba2..641ae33 100644 --- a/pesuacademy/pages/faculty.py +++ b/pesuacademy/pages/faculty.py @@ -63,45 +63,59 @@ def get_urls_from_campus_and_department( return urls @staticmethod - def get_staff_details() -> list[Professor]: + def get_all_faculty_ids_from_url( + session: requests_html.HTMLSession, url: str, page: int = 1 + ) -> list[str]: try: - base_url = "https://staff.pes.edu/atoz/" - session = requests_html.HTMLSession() - response = session.get(base_url) + current_url = f"{url}?page={page}" + print("entered loop", page, current_url) + response = session.get(current_url) if response.status_code != 200: - raise ConnectionError(f"Failed to fetch URL: {base_url}") - - soup = BeautifulSoup(response.text, "html.parser") - last_page_span = soup.find( - "span", {"aria-hidden": "true"} - ) # getting the last page from the pagination end - last_page_number = int(last_page_span.get_text()) - PESU_STAFF_LIST = [] - for page_num in range(1, last_page_number + 1): - print("Scraping page:", page_num) - staff_url = f"{base_url}?page={page_num}" - response = session.get(staff_url) + return [] + else: soup = BeautifulSoup(response.text, "html.parser") - - staff_divs = soup.find_all("div", class_="staff-profile") - for staff_div in staff_divs: - anchor_tag = staff_div.find("a", class_="geodir-category-img_item") - if anchor_tag: - base_url_single_staff = "https://staff.pes.edu/" - staff_url = anchor_tag["href"] - request_path = base_url_single_staff + staff_url[1:] - PESU_STAFF = FacultyPageHandler.get_details_from_url( - request_path, session + if next_page := soup.find("a", class_="nextposts-link"): + next_page_number = int(next_page["href"].split("?page=")[-1]) + else: + next_page_number = None + + print("Next page number", next_page_number) + faculty_divs = soup.find_all("div", class_="staff-profile") + faculty_ids = [ + div.find("a", class_="geodir-category-img_item")["href"].split("/")[ + -2 + ] + for div in faculty_divs + ] + if next_page_number is not None: + faculty_ids.extend( + FacultyPageHandler.get_all_faculty_ids_from_url( + session, url, next_page_number ) - PESU_STAFF_LIST.append(PESU_STAFF) + ) + return faculty_ids + except Exception: + return [] - return PESU_STAFF_LIST + @staticmethod + def get_faculty_by_id( + session: requests_html.HTMLSession, faculty_id: str + ) -> Professor: + url = f"https://staff.pes.edu/{faculty_id}" + response = session.get(url) + if response.status_code != 200: + raise ConnectionError(f"Failed to fetch URL: {url}") - except Exception as e: - print(f"Error occurred: {e}") - raise ConnectionError("Unable to fetch staff data.") - finally: - session.close() + soup = BeautifulSoup(response.text, "html.parser") + name = soup.find("h4").text.strip() + domains = [ + item.text.strip() + for item in soup.select( + "#tab-teaching .bookings-item-content ul.ul-item-left li" + ) + ] + designation = soup.find("h5").text.strip() + print() @staticmethod def get_details_from_url(url, session): @@ -206,7 +220,14 @@ def get_page( ) -> list[Professor]: urls = self.get_urls_from_campus_and_department(campus, department) # TODO: Scrape the data from the URLs. Use the same session object provided. - # professors = list() - # for url in urls: - # professors.extend(get_faculty(session, url)) - # return professors + # TODO: Add search functionality for name: https://staff.pes.edu/atoz/list/?search={name} + professors: list[Professor] = list() + for url in urls: + faculty_ids = self.get_all_faculty_ids_from_url(session, url, page=1) + for faculty_id in faculty_ids: + professors.extend(self.get_faculty_by_id(session, faculty_id)) + if designation is not None: + professors = list( + filter(lambda x: x.designation == designation, professors) + ) + return professors From ae7ae14338342c386d41b77351ddd31e412f119f Mon Sep 17 00:00:00 2001 From: Digvijay Narayan Date: Tue, 7 May 2024 22:08:48 +0530 Subject: [PATCH 11/13] completed search feature for faculty and edited get_page function --- pesuacademy/pages/faculty.py | 62 ++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/pesuacademy/pages/faculty.py b/pesuacademy/pages/faculty.py index 641ae33..24f901e 100644 --- a/pesuacademy/pages/faculty.py +++ b/pesuacademy/pages/faculty.py @@ -4,6 +4,7 @@ from pesuacademy.models.professor import Professor + class FacultyPageHandler: departments = { "arch": "architecture", @@ -219,15 +220,56 @@ def get_page( designation: Optional[str] = None, ) -> list[Professor]: urls = self.get_urls_from_campus_and_department(campus, department) - # TODO: Scrape the data from the URLs. Use the same session object provided. - # TODO: Add search functionality for name: https://staff.pes.edu/atoz/list/?search={name} - professors: list[Professor] = list() + professors: list[Professor] = [] for url in urls: - faculty_ids = self.get_all_faculty_ids_from_url(session, url, page=1) - for faculty_id in faculty_ids: - professors.extend(self.get_faculty_by_id(session, faculty_id)) - if designation is not None: - professors = list( - filter(lambda x: x.designation == designation, professors) - ) + response = session.get(url) + if response.status_code != 200: + raise ConnectionError(f"Failed to fetch URL: {url}") + soup = BeautifulSoup(response.text, "html.parser") + faculty_divs = soup.find_all("div", class_="staff-profile") + for faculty_div in faculty_divs: + anchor_tag = faculty_div.find("a", class_="geodir-category-img_item") + base_url_single_faculty = "https://staff.pes.edu/" + faculty_url = anchor_tag["href"] + request_path = base_url_single_faculty + faculty_url[1:] + professor = self.get_details_from_url(request_path, session) + professors.append(professor) + if designation: + professors = [professor for professor in professors if designation in professor.designation] return professors + + + # TODO: Add search functionality for name: https://staff.pes.edu/atoz/list/?search={name} + def get_faculty_by_name(self, name: str, session: requests_html.HTMLSession) -> list[Professor]: + professors: list[Professor] = [] + url = f"https://staff.pes.edu/atoz/list/?search={name}" + response=session.get(url) + soup = BeautifulSoup(response.text, "html.parser") + # professor_names = [tag.text.strip() for tag in soup.find_all('h4')] + # print(professor_names) + soup = BeautifulSoup(response.text, "html.parser") + faculty_divs = soup.find_all("div", class_="col-md-3 left-padding-0") + for faculty_div in faculty_divs: + anchor_tag = faculty_div.find("a", class_="chat-contacts-item") + if anchor_tag: + faculty_url = anchor_tag["href"] + base_url_single_faculty = "https://staff.pes.edu" + request_path = base_url_single_faculty + faculty_url + professor = self.get_details_from_url(request_path, session) + print(professor) + professors.append(professor) + print(professors) + # return professors + # professors: list[Professor] = list() + # for url in urls: + # faculty_ids = self.get_all_faculty_ids_from_url(session, url, page=1) + # for faculty_id in faculty_ids: + # professors.extend(self.get_faculty_by_id(session, faculty_id)) + # if designation is not None: + # professors = list( + # filter(lambda x: x.designation == designation, professors) + # ) + # return professors + + + From d7630bbaeecd2845e7203f2b84819c6e774f64cf Mon Sep 17 00:00:00 2001 From: aditeyabaral Date: Fri, 10 May 2024 14:25:54 +0000 Subject: [PATCH 12/13] Revert commit --- pesuacademy/pages/faculty.py | 59 +++++------------------------------- 1 file changed, 7 insertions(+), 52 deletions(-) diff --git a/pesuacademy/pages/faculty.py b/pesuacademy/pages/faculty.py index 24f901e..240bd64 100644 --- a/pesuacademy/pages/faculty.py +++ b/pesuacademy/pages/faculty.py @@ -4,7 +4,6 @@ from pesuacademy.models.professor import Professor - class FacultyPageHandler: departments = { "arch": "architecture", @@ -220,56 +219,12 @@ def get_page( designation: Optional[str] = None, ) -> list[Professor]: urls = self.get_urls_from_campus_and_department(campus, department) - professors: list[Professor] = [] + # TODO: Add search functionality for name: https://staff.pes.edu/atoz/list/?search={name} + professors: list[Professor] = list() for url in urls: - response = session.get(url) - if response.status_code != 200: - raise ConnectionError(f"Failed to fetch URL: {url}") - soup = BeautifulSoup(response.text, "html.parser") - faculty_divs = soup.find_all("div", class_="staff-profile") - for faculty_div in faculty_divs: - anchor_tag = faculty_div.find("a", class_="geodir-category-img_item") - base_url_single_faculty = "https://staff.pes.edu/" - faculty_url = anchor_tag["href"] - request_path = base_url_single_faculty + faculty_url[1:] - professor = self.get_details_from_url(request_path, session) - professors.append(professor) - if designation: - professors = [professor for professor in professors if designation in professor.designation] + faculty_ids = self.get_all_faculty_ids_from_url(session, url, page=1) + for faculty_id in faculty_ids: + professor = self.get_faculty_by_id(session, faculty_id) + if designation is None or professor.designation == designation: + professors.append(professor) return professors - - - # TODO: Add search functionality for name: https://staff.pes.edu/atoz/list/?search={name} - def get_faculty_by_name(self, name: str, session: requests_html.HTMLSession) -> list[Professor]: - professors: list[Professor] = [] - url = f"https://staff.pes.edu/atoz/list/?search={name}" - response=session.get(url) - soup = BeautifulSoup(response.text, "html.parser") - # professor_names = [tag.text.strip() for tag in soup.find_all('h4')] - # print(professor_names) - soup = BeautifulSoup(response.text, "html.parser") - faculty_divs = soup.find_all("div", class_="col-md-3 left-padding-0") - for faculty_div in faculty_divs: - anchor_tag = faculty_div.find("a", class_="chat-contacts-item") - if anchor_tag: - faculty_url = anchor_tag["href"] - base_url_single_faculty = "https://staff.pes.edu" - request_path = base_url_single_faculty + faculty_url - professor = self.get_details_from_url(request_path, session) - print(professor) - professors.append(professor) - print(professors) - # return professors - # professors: list[Professor] = list() - # for url in urls: - # faculty_ids = self.get_all_faculty_ids_from_url(session, url, page=1) - # for faculty_id in faculty_ids: - # professors.extend(self.get_faculty_by_id(session, faculty_id)) - # if designation is not None: - # professors = list( - # filter(lambda x: x.designation == designation, professors) - # ) - # return professors - - - From 153e7ba6ebe7b81c903363f825aca8d3c6500369 Mon Sep 17 00:00:00 2001 From: Digvijay Narayan Date: Fri, 7 Jun 2024 04:12:38 +0530 Subject: [PATCH 13/13] removed unwanted print statements --- pesuacademy/pages/faculty.py | 58 +++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/pesuacademy/pages/faculty.py b/pesuacademy/pages/faculty.py index 240bd64..8b5bcae 100644 --- a/pesuacademy/pages/faculty.py +++ b/pesuacademy/pages/faculty.py @@ -94,6 +94,7 @@ def get_all_faculty_ids_from_url( ) ) return faculty_ids + except Exception: return [] @@ -102,6 +103,7 @@ def get_faculty_by_id( session: requests_html.HTMLSession, faculty_id: str ) -> Professor: url = f"https://staff.pes.edu/{faculty_id}" + # print(url) response = session.get(url) if response.status_code != 200: raise ConnectionError(f"Failed to fetch URL: {url}") @@ -115,26 +117,9 @@ def get_faculty_by_id( ) ] designation = soup.find("h5").text.strip() - print() - - @staticmethod - def get_details_from_url(url, session): - response = session.get(url) - if response.status_code != 200: - raise ConnectionError(f"Failed to fetch URL: {url}") - soup = BeautifulSoup(response.text, "html.parser") - # name - name_tag = soup.find("h4") - name = name_tag.text.strip() if name_tag else None - # domain - teaching_items = soup.select( - "#tab-teaching .bookings-item-content ul.ul-item-left li" - ) - domains = [item.text.strip() for item in teaching_items] - # designation - designation = soup.find("h5") - designation = " ".join(designation.text.split()) - # Education + designation = [d.strip() for d in designation.split(",")] + # print() + # Education professor_education = [] education_section = soup.find_all("h3") education_section_filter = [ @@ -198,6 +183,7 @@ def get_details_from_url(url, session): if responsibilities_div is not None: p_tags = responsibilities_div.find_all("p") responsibilities = [p.text for p in p_tags] + Pesu_Staff = Professor( name=name, designation=designation, @@ -210,6 +196,28 @@ def get_details_from_url(url, session): responsibilities=responsibilities, ) return Pesu_Staff + + def get_faculty_by_name(self, name: str, session: requests_html.HTMLSession) -> list[Professor]: + professors: list[Professor] = [] + url = f"https://staff.pes.edu/atoz/list/?search={name}" + response = session.get(url) + soup = BeautifulSoup(response.text, "html.parser") + faculty_divs = soup.find_all("div", class_="col-md-3 left-padding-0") + + faculty_ids = [ + div.find("a", class_="chat-contacts-item")["href"].split("/")[-2] + for div in faculty_divs + ] + print(faculty_ids) + # Retrieve details for each faculty ID + for faculty_id in faculty_ids: + professor = self.get_faculty_by_id(session, faculty_id) + if professor: + professors.append(professor) + + return professors + + def get_page( self, @@ -217,14 +225,22 @@ def get_page( campus: Optional[str] = None, department: Optional[str] = None, designation: Optional[str] = None, + name:Optional[str] = None ) -> list[Professor]: urls = self.get_urls_from_campus_and_department(campus, department) # TODO: Add search functionality for name: https://staff.pes.edu/atoz/list/?search={name} + if name: + professors=self.get_faculty_by_name(name,session) + return professors + print(urls) professors: list[Professor] = list() for url in urls: faculty_ids = self.get_all_faculty_ids_from_url(session, url, page=1) for faculty_id in faculty_ids: professor = self.get_faculty_by_id(session, faculty_id) - if designation is None or professor.designation == designation: + # print(professor.designation) + if designation is None or designation in professor.designation: professors.append(professor) return professors + +