-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtempCodeRunnerFile.py
More file actions
157 lines (129 loc) · 6.19 KB
/
tempCodeRunnerFile.py
File metadata and controls
157 lines (129 loc) · 6.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import os
import random
import sys
import time

import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Read the LinkedIn credentials from credentials.env into module constants.
# Either constant is None when the corresponding variable is not defined.
load_dotenv("credentials.env")
LINKEDIN_EMAIL = os.environ.get("LINKEDIN_EMAIL")
LINKEDIN_PASSWORD = os.environ.get("LINKEDIN_PASSWORD")

# Build the Chrome options; the switches are applied in the order listed.
options = Options()
for _chrome_switch in (
    "--headless=new",  # no visible window (drop this switch when debugging)
    "--disable-gpu",
    "--disable-dev-shm-usage",
    "--no-sandbox",
    # Turns off the AutomationControlled Blink feature; intended to make the
    # session look less like an automated browser.
    "--disable-blink-features=AutomationControlled",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
):
    options.add_argument(_chrome_switch)
del _chrome_switch  # keep the loop variable out of the module namespace

# Start the shared Chrome session used by every function in this module.
# ChromeDriverManager().install() supplies the chromedriver path to Service.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
def login_linkedin():
    """Log into LinkedIn using the module-level credentials.

    Opens the LinkedIn login page, types ``LINKEDIN_EMAIL`` into the
    ``username`` field and ``LINKEDIN_PASSWORD`` into the ``password`` field,
    submits the form with RETURN and then sleeps 5-7 seconds so the login
    can complete.

    Any failure (credentials not configured, navigation error, a field not
    appearing within 10 seconds, ...) is reported on stdout, the browser is
    closed and the process is terminated.

    Raises:
        SystemExit: with status 1 when the login sequence fails.
    """
    try:
        # Fail early with a readable message instead of the TypeError that
        # `None + Keys.RETURN` would otherwise produce.
        if not LINKEDIN_EMAIL or not LINKEDIN_PASSWORD:
            raise ValueError(
                "LINKEDIN_EMAIL and LINKEDIN_PASSWORD must be set "
                "(expected in credentials.env)"
            )

        driver.get("https://www.linkedin.com/login")

        # Each until() call gets its own 10-second budget.
        wait = WebDriverWait(driver, 10)
        wait.until(
            EC.presence_of_element_located((By.ID, "username"))
        ).send_keys(LINKEDIN_EMAIL)
        wait.until(
            EC.presence_of_element_located((By.ID, "password"))
        ).send_keys(LINKEDIN_PASSWORD + Keys.RETURN)

        time.sleep(random.uniform(5, 7))  # Allow login process
        # NOTE(review): success is not verified here; this message only means
        # the form was submitted without raising. Rejected credentials or a
        # verification challenge would still reach this line.
        print("✅ Logged in successfully!")
    except Exception as e:
        print(f"❌ Login failed: {e}")
        driver.quit()
        # Non-zero status so callers/shell scripts can detect the failure;
        # sys.exit is used because the bare exit() helper comes from `site`
        # and is not guaranteed to exist.
        sys.exit(1)
def scroll_page():
    """Scroll to the bottom of the page until its height stops changing.

    After every scroll the function pauses 2-4 seconds, then re-reads
    ``document.body.scrollHeight``; it returns once two consecutive readings
    are equal. Uses the module-level ``driver``.
    """
    read_height_js = "return document.body.scrollHeight"
    jump_to_bottom_js = "window.scrollTo(0, document.body.scrollHeight);"

    previous_height = driver.execute_script(read_height_js)
    page_grew = True
    while page_grew:
        driver.execute_script(jump_to_bottom_js)
        time.sleep(random.uniform(2, 4))  # give lazily loaded content time to appear
        current_height = driver.execute_script(read_height_js)
        page_grew = current_height != previous_height
        previous_height = current_height
def _text_or_default(card, xpath, default):
    """Return the stripped text of the first node under *card* matching *xpath*.

    Falls back to *default* when the lookup or text read fails. Catches
    ``Exception`` rather than using a bare ``except`` so Ctrl-C and
    ``SystemExit`` still propagate.
    """
    try:
        return card.find_element(By.XPATH, xpath).text.strip()
    except Exception:
        return default


def _image_src_or_default(card, default="No image"):
    """Return the ``src`` of the first ``<img>`` under *card*, or *default*."""
    try:
        # get_attribute returns None when the attribute is absent; treat that
        # (and an empty src) the same as "no image element".
        return card.find_element(By.TAG_NAME, "img").get_attribute("src") or default
    except Exception:
        return default


def _extract_profile(card):
    """Build one alumni record (a dict of six string fields) from a profile card."""
    name = _text_or_default(
        card, './/div[contains(@class, "artdeco-entity-lockup__title")]', "Unknown"
    )
    job_title = _text_or_default(
        card, './/div[contains(@class, "artdeco-entity-lockup__subtitle")]', "N/A"
    )
    description = _text_or_default(
        card, './/div[contains(@class, "lt-line-clamp--multi-line")]', "No description"
    )
    image_url = _image_src_or_default(card)
    location = _text_or_default(
        card, './/span[contains(@class, "text-align-center")]', "Unknown"
    )
    # Strip the "Class of " prefix; the "N/A" default passes through unchanged.
    grad_year = _text_or_default(
        card, './/span[contains(text(), "Class of")]', "N/A"
    ).replace("Class of ", "")
    return {
        "Name": name,
        "Job Title": job_title,
        "Profile Description": description,
        "Profile Image": image_url,
        "Location": location,
        "Graduation Year": grad_year,
    }


def search_alumni():
    """Scrape the profile cards on the school's LinkedIn "people" page.

    Loads the people page for unika-soegijapranata-semarang, scrolls until no
    more content loads, waits up to 10 seconds for profile cards and extracts
    one record per card. Fields that cannot be read get a placeholder
    ("Unknown", "N/A", "No description", "No image"). Progress and errors are
    printed to stdout.

    Returns:
        list[dict]: one dict per profile with the keys "Name", "Job Title",
        "Profile Description", "Profile Image", "Location" and
        "Graduation Year". Empty when no profile cards were found.
    """
    search_url = "https://www.linkedin.com/school/unika-soegijapranata-semarang/people/"
    driver.get(search_url)
    time.sleep(random.uniform(5, 7))  # Allow page to load

    alumni_list = []
    scroll_page()  # Scroll to load more profiles dynamically

    try:
        profiles = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, '//div[contains(@class, "org-people-profile-card__profile-info")]')
            )
        )
        print(f"🔍 Found {len(profiles)} profiles")
    except Exception as e:
        # Typically the wait timing out; report it and return an empty result.
        print(f"❌ No profiles found: {e}")
        profiles = []

    for profile in profiles:
        try:
            record = _extract_profile(profile)
            print(
                f"👤 {record['Name']} | 💼 {record['Job Title']} | "
                f"📜 {record['Profile Description']} | 🖼️ {record['Profile Image']} | "
                f"📍 {record['Location']} | 🎓 {record['Graduation Year']}"
            )  # Debugging Output
            alumni_list.append(record)
        except Exception as e:
            # Best effort: one bad card must not abort the whole scrape.
            print(f"⚠️ Error extracting profile: {e}")
            continue

    print(f"✅ Scraped {len(alumni_list)} profiles")
    return alumni_list
def save_to_csv(data, filename="LinkedIn_SCU_Alumni.csv"):
    """Write the scraped records to a CSV file with one column per field.

    Args:
        data: iterable of dicts as produced by ``search_alumni``; may be
            empty, in which case only the header row is written.
        filename: destination path. Defaults to "LinkedIn_SCU_Alumni.csv" in
            the current working directory, so existing callers are unaffected.

    The column order is fixed: Name, Job Title, Profile Description,
    Profile Image, Location, Graduation Year. No index column is written.
    """
    df = pd.DataFrame(
        data,
        columns=[
            "Name",
            "Job Title",
            "Profile Description",
            "Profile Image",
            "Location",
            "Graduation Year",
        ],
    )
    df.to_csv(filename, index=False)
    print(f"✅ Data saved to {filename}")
def main():
    """Run the scraper end to end: log in, scrape the alumni, write the CSV.

    The browser is always shut down at the end, including when scraping or
    saving raises, so no Chrome process is left running.
    """
    try:
        login_linkedin()
        alumni_data = search_alumni()
        save_to_csv(alumni_data)
    finally:
        # login_linkedin() quits the driver itself before exiting on failure,
        # so this can be a second quit() on that path.
        # NOTE(review): a repeated quit() is presumed harmless for the Chrome
        # driver — confirm against the installed Selenium version.
        driver.quit()
# Run the scraper only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()