"""Name & email extractor.

Note: this script is still being generalised so that it can extract names and
e-mail addresses from any type of web page.

It already works for many types of links.
"""

import os
import re
import sys
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import requests
import urllib3

# Silence urllib3's InsecureRequestWarning so the terminal output stays readable.
# The warning would otherwise be raised for HTTPS requests made without
# certificate verification (this file sets `session.verify = False`).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Patterns for discovering e-mail addresses and phone numbers in free text.
EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

# Phone numbers written with a "+94" or "0" prefix, followed by a two-digit
# code and 3 + 4 further digits, each group optionally separated by a space or
# hyphen. A separator is also accepted right after "+94" ("+94 11 258 1835"),
# and the digit look-arounds stop the pattern from matching a slice of a longer
# digit run such as an ID or reference number.
PHONE_PATTERN = re.compile(
    r"(?<!\d)(?:\+94[\s-]?|0)[1-9][0-9]\s?[- ]?[0-9]{3}\s?[- ]?[0-9]{4}(?!\d)"
)

# Academic titles and grade markers that can bleed into an extracted name.
# The pattern is used both to strip these words out of names and to spot a
# designation in free text.
# The bare Roman numerals "I" / "II" are grade markers. They are skipped when a
# period follows directly, so that an initial such as the "I." in
# "Dr. I. Perera" is not mistaken for a grade and deleted from the name.
DESIGNATION_CLEAN_PATTERN = re.compile(
    r"\b(Senior Professor|Senior Lecturer|Assistant Lecturer|Professor|Lecturer|Instructor|Head|Dean|HOD|Probationary|Grade I|Grade II|II(?!\.)|I(?!\.)|Visiting|Chair)\b",
    re.IGNORECASE
)

# URL substrings that mark a link as something other than a personal profile
# page. A link whose lower-cased URL contains any of these is not crawled.
BLACKLIST_KEYWORDS = [
    "accreditation",
    "scholarship",
    "schedule",
    "guidelines",
    "structure",
    "research",
    "mile-stone",
    "grants",
    "partners-in-leaning",
    "student-handbook",
    "news",
    "programme",
    "internship",
    "dissertation",
    "exemption",
    "calendar-of-dates",
    "heads-of-the-department",
    "library",
    "student-resources",
    "msu",
    "curriculum",
    "syllabus",
    "fee",
    "exam",
    "admission",
    "course",
    "activity",
    "ethics-review-committee",
    "iqac",
    "alumni",
    "video-resources",
    "history",
    "gallery",
    "event",
    "notice",
    "download",
    "publication",
]

# Browser-like request headers sent with every request of the shared session.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    # Only advertise encodings that can always be decoded. "br" (Brotli) is
    # decoded only when an optional Brotli package is installed; without it a
    # Brotli-compressed body comes back undecoded, the page text is unreadable
    # and no names or e-mails are found.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
}

# Persistent network session to speed up sequential loop downloads.
# Every download in this file goes through this one session, so the headers
# and the verification setting configured here apply to all requests.
session = requests.Session()
session.headers.update(HEADERS)
# SECURITY NOTE: TLS certificate verification is switched off for every request
# made through this session, so the identity of the remote server is not
# checked. This is a deliberate trade-off to tolerate misconfigured
# certificates on university sites.
session.verify = False  # Global bypass for local university SSL configuration errors

def clean_extracted_name(name_str):
    """Strip academic designation keywords and stray separators from a name.

    Args:
        name_str: Raw name text. It may contain designation words (for example
            "Senior Lecturer") and leading or trailing separator characters.

    Returns:
        The cleaned name with runs of whitespace collapsed to single spaces, or
        the placeholder "Name Check Required" when the input is empty or
        nothing is left once designations and separators are removed.
    """
    if not name_str:
        return "Name Check Required"
    name_str = DESIGNATION_CLEAN_PATTERN.sub("", name_str)
    # Trim runs of whitespace/separators at both ends. The end anchor has to be
    # an unescaped "$": an escaped "\$" matches a literal dollar sign, which
    # would leave trailing separators such as "Dr. A. Perera -" in place.
    name_str = re.sub(r'^[\s\-–—,•|:]+|[\s\-–—,•|:]+$', '', name_str)
    cleaned = " ".join(name_str.split())
    # A string made only of designation words (e.g. "Professor") is not a name.
    return cleaned or "Name Check Required"

# Detects an honorific in a heading or page title. Word boundaries keep words
# such as "Profile" or "programs." from being mistaken for "Prof" / "Ms.".
_HONORIFIC_HINT = re.compile(r"\b(?:prof(?:essor)?\b|dr\.|mr\.|ms\.|mrs\.)", re.IGNORECASE)
# Boilerplate words removed from a page title before it is used as a name.
_TITLE_BOILERPLATE = re.compile(r"\b(?:staff|profile)\b", re.IGNORECASE)
# Page-title dividers: "|" anywhere, or a dash with whitespace on at least one
# side, so hyphenated names such as "Anne-Marie" are not cut in half.
_TITLE_DIVIDER = re.compile(r"\||\s[-–—]|[-–—]\s")


def discover_name_on_profile(soup):
    """Find the person's name on a profile page.

    Headings (h1, then h2, then h3) are checked first: the first heading that
    contains an honorific and still has text left after designation words are
    removed is returned. Otherwise the part of the page <title> before the
    first divider is used when it contains an honorific or the word "staff".

    Args:
        soup: Parsed page exposing `find_all()` and `title` (a BeautifulSoup
            document in this file).

    Returns:
        The cleaned name, or "Name Check Required" when no candidate is found.
    """
    for tag_name in ('h1', 'h2', 'h3'):
        for heading in soup.find_all(tag_name):
            text = heading.get_text().strip()
            if not _HONORIFIC_HINT.search(text):
                continue
            candidate = clean_extracted_name(text)
            # A heading such as "Professor" cleans down to nothing: keep looking.
            if candidate and candidate != "Name Check Required":
                return candidate

    page_title = soup.title.string if soup.title else None
    if page_title:
        title_text = _TITLE_DIVIDER.split(page_title, maxsplit=1)[0].strip()
        if _HONORIFIC_HINT.search(title_text) or 'staff' in title_text.lower():
            # Remove "Staff" / "Profile" in any letter case, matching the
            # case-insensitive check above.
            stripped = _TITLE_BOILERPLATE.sub('', title_text).strip()
            return clean_extracted_name(stripped) or "Name Check Required"

    return "Name Check Required"

# Academic rank, optionally followed by a grade ("Grade II", "II", "2") or a
# "(Confirmed)" / "(Probationary)" suffix in ASCII or full-width parentheses.
# A bare numeral directly followed by a period is treated as a name initial
# ("Professor V. Kumar"), not as a grade.
_PROFILE_RANK_PATTERN = re.compile(
    r"\b(?:Senior Professor|Senior Lecturer|Assistant Lecturer|Lecturer|Professor)"
    r"(?:\s*Grade\s*[IVX12]+\b"
    r"|\s*[IVX12]+\b(?!\.)"
    r"|\s*[(（]\s*(?:Confirmed|Probationary)\s*[)）]"
    r"|\b)",
    re.IGNORECASE,
)


def _title_case_rank(raw_rank):
    """Title-case a matched rank while keeping Roman grade numerals upper-case."""
    words = []
    for word in raw_rank.split():
        if re.fullmatch(r"[ivx]+", word, re.IGNORECASE):
            # str.title() would turn "II" into "Ii".
            words.append(word.upper())
        else:
            words.append(word.title())
    return " ".join(words)


def discover_designation_on_profile(soup):
    """Classify the academic rank mentioned in a profile page's text.

    A leadership keyword (Head of Department, Head, HOD, Dean) found anywhere
    in the page text takes priority over an academic rank.
    NOTE(review): because the whole page text is searched, a navigation link
    containing "Head" or "Dean" is enough to label the page that way.

    Args:
        soup: Parsed page exposing `get_text(separator=...)` (a BeautifulSoup
            document in this file).

    Returns:
        "HOD", a title-cased leadership keyword, a rank such as
        "Senior Lecturer Grade II", or "Academic Staff" when nothing matches.
    """
    page_text = soup.get_text(separator=" ")

    leader_match = re.search(r'\b(Head of Department|Head|HOD|Dean)\b', page_text, re.IGNORECASE)
    if leader_match:
        val = leader_match.group(0).lower()
        return "HOD" if val == 'hod' else leader_match.group(0).title()

    rank_match = _PROFILE_RANK_PATTERN.search(page_text)
    if rank_match:
        return _title_case_rank(rank_match.group(0))

    return "Academic Staff"

# Building blocks for the name pattern below.
_NAME_WORD = r"[A-Z][A-Za-z]+"
_NAME_PARTICLES = r"(?:(?:de|da|van|von|der|bin)\s+)*"
# Honorific followed by a name. Only the honorific is case-insensitive: each
# name word must start with a capital letter (lower-case particles such as
# "de" are allowed before the last word), so the match stops at the first
# ordinary lower-case word instead of running on through the following text.
_SNIPPET_NAME_PATTERN = re.compile(
    r"\b(?i:Prof\.|Dr\.|Mr\.|Mrs\.|Ms\.|Ven\.)\s+"
    r"(?:"
    rf"(?:[A-Z]\.\s*)+{_NAME_PARTICLES}{_NAME_WORD}"
    rf"|(?:{_NAME_WORD}\s+)+{_NAME_PARTICLES}{_NAME_WORD}"
    rf"|{_NAME_PARTICLES}{_NAME_WORD}"
    r")"
)
# A designation match that is only a grade marker ("I", "II", "Grade I", ...).
_BARE_GRADE_TOKEN = re.compile(r"(?:Grade\s*)?[IVX]+", re.IGNORECASE)


def _title_case_snippet_rank(raw_rank):
    """Title-case a matched rank while keeping Roman grade numerals upper-case."""
    return " ".join(
        word.upper() if re.fullmatch(r"[ivx]+", word, re.IGNORECASE) else word.title()
        for word in raw_rank.split()
    )


def parse_details_from_snippet(text):
    """Extract a name, designation and phone number from a text snippet.

    Args:
        text: Free text surrounding an e-mail address; whitespace is
            normalised before matching.

    Returns:
        A `(name, designation, phone)` tuple of strings. The fallbacks are
        "Name Check Required", "Lecturer / Academic Staff" and "N/A".
    """
    clean_text = " ".join(text.split())

    # Fallback designation: first designation keyword in the snippet, ignoring
    # bare grade markers so a stray "I" is never reported as a designation.
    designation = "Lecturer / Academic Staff"
    for desig_match in DESIGNATION_CLEAN_PATTERN.finditer(clean_text):
        token = desig_match.group(0).strip()
        if not _BARE_GRADE_TOKEN.fullmatch(token):
            designation = token
            break

    # A leadership keyword wins over a lecturer grade; either overrides the fallback.
    leader_match = re.search(r'\b(Head of Department|Head|HOD|Dean)\b', clean_text, re.IGNORECASE)
    if leader_match:
        val = leader_match.group(0).lower()
        designation = "HOD" if val == 'hod' else leader_match.group(0).title()
    else:
        grade_match = re.search(r'\b(Senior Lecturer|Lecturer|Assistant Lecturer)\s*(?:Grade\s*[IVX12i]+|[IVX12i]+)?\b', clean_text, re.IGNORECASE)
        if grade_match:
            # Plain .title() would render "Grade II" as "Grade Ii".
            designation = _title_case_snippet_rank(grade_match.group(0))

    name_match = _SNIPPET_NAME_PATTERN.search(clean_text)
    raw_extracted_name = name_match.group(0).strip() if name_match else "Name Check Required"
    final_clean_name = clean_extracted_name(raw_extracted_name)

    phone_match = PHONE_PATTERN.search(clean_text)
    phone = phone_match.group(0) if phone_match else "N/A"

    return final_clean_name, designation, phone

def get_profile_links_from_directory(directory_url):
    """Collect candidate profile-page links from a staff directory page.

    A link is kept when it uses http(s), points at the same institution as the
    directory (same root domain or a sub-domain of it), is not the directory
    itself or a site root, and its URL contains none of the blacklisted or
    non-profile markers.

    Args:
        directory_url: Absolute URL of the directory page to scan.

    Returns:
        A sorted list of unique absolute URLs with fragments removed. The list
        is empty when the page does not answer with status 200; if an error
        occurs, the links collected up to that point are returned.
    """
    profile_links = set()
    try:
        response = session.get(directory_url, timeout=15)
        if response.status_code != 200:
            print(f"[-] Directory connection rejected. Status Code: {response.status_code}")
            return []
        soup = BeautifulSoup(response.text, "html.parser")

        # Work on the hostname (lower-cased, without port) rather than the raw netloc.
        directory_host = urllib.parse.urlparse(directory_url).hostname or ""
        if directory_host.startswith("www."):
            directory_host = directory_host[len("www."):]
        dir_parts = directory_host.split(".")
        # "*.ac.lk" hosts need three labels to identify the institution.
        if directory_host.endswith(".ac.lk"):
            uni_root_boundary = ".".join(dir_parts[-3:])
        else:
            uni_root_boundary = ".".join(dir_parts[-2:])

        directory_key = urllib.parse.urldefrag(directory_url)[0].rstrip("/")

        for anchor in soup.find_all("a", href=True):
            href = anchor["href"].strip()
            if not href or href in ["#", "/", "index.php", "index.html", "home"]:
                continue

            # Drop "#fragment": it only addresses a position inside a page, so
            # keeping it would queue the directory itself ("#top") or the same
            # profile several times.
            full_url = urllib.parse.urldefrag(urllib.parse.urljoin(directory_url, href))[0]
            parsed_sublink = urllib.parse.urlparse(full_url)
            if parsed_sublink.scheme not in ("http", "https"):
                continue

            # Same institution: the root domain itself or one of its sub-domains.
            # A plain substring test would also accept "cmb.ac.lk.example.com".
            link_host = parsed_sublink.hostname or ""
            if link_host != uni_root_boundary and not link_host.endswith("." + uni_root_boundary):
                continue

            sublink_path = parsed_sublink.path.rstrip("/")
            if not sublink_path or sublink_path == "/index.php":
                continue
            if full_url.rstrip("/") == directory_key:
                continue

            url_lower = full_url.lower()
            if any(kw in url_lower for kw in BLACKLIST_KEYWORDS):
                continue
            if any(x in url_lower for x in ['.pdf', '.jpg', '.png', '.zip', 'contact', 'about', 'lms']):
                continue
            profile_links.add(full_url)
    except Exception as e:
        print(f"[-] Error collecting profile links: {e}")
    # Sorted so that repeated runs visit the pages in the same order.
    return sorted(profile_links)

def extract_contacts_deep(url):
    """Download one page and extract contact records from its text.

    When more than three distinct e-mail addresses appear in the page text, the
    page is treated as a shared listing and one record is built per address
    from the 250 characters on either side of it. Otherwise the page is treated
    as a single person's profile and at most one record is produced.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A list of dicts with the keys "Name", "Email", "Designation", "Phone"
        and "Profile URL". The list is empty when the download fails, the
        server does not answer with status 200, or a profile page yields
        neither a name nor an e-mail address.
    """
    records = []
    try:
        response = session.get(url, timeout=12)
        if response.status_code != 200:
            print(f"[-] Profile link rejected by server. Status Code: {response.status_code}")
            return records
        soup = BeautifulSoup(response.text, "html.parser")
        raw_text = soup.get_text(separator=" ")
    except Exception as e:
        print(f"[-] Connection failed to {url}: {e}")
        return records

    # De-duplicate while keeping page order. A set would make the "first"
    # e-mail used for a profile page arbitrary and change between runs.
    surface_emails = list(dict.fromkeys(EMAIL_PATTERN.findall(raw_text)))

    # CASE 1: The page is a shared dynamic master table containing everyone's details
    if len(surface_emails) > 3:
        print(f"[+] Multiple emails found directly on landing directory page: {url}")
        for email in surface_emails:
            email_pos = raw_text.find(email)
            window = raw_text[max(0, email_pos - 250):min(len(raw_text), email_pos + 250)]
            snippet = " ".join(window.split())
            name, designation, phone = parse_details_from_snippet(snippet)
            records.append({
                "Name": name,
                "Email": email,
                "Designation": designation,
                "Phone": phone,
                "Profile URL": url
            })
        return records

    # CASE 2: The page is an individual personal profile biography path
    name = discover_name_on_profile(soup)
    designation = discover_designation_on_profile(soup)
    email = surface_emails[0] if surface_emails else "N/A"
    phone_match = PHONE_PATTERN.search(raw_text)
    phone = phone_match.group(0) if phone_match else "N/A"

    # Keep the page only if it produced at least a name or an e-mail address.
    if name != "Name Check Required" or email != "N/A":
        records.append({
            "Name": name,
            "Email": email,
            "Designation": designation,
            "Phone": phone,
            "Profile URL": url
        })

    return records


# --- RUNTIME EXECUTION CONTROLLER ---
def main():
    """Run the pipeline: collect profile links, scrape each page, export a CSV.

    The directory URL is taken from the first command-line argument or, when
    none is given, read interactively. The process exits with status 1 if the
    URL does not start with http:// or https://.
    """
    # Check if a URL was passed directly in the terminal invocation
    if len(sys.argv) > 1:
        target_directory = sys.argv[1].strip()
    else:
        print("💡 Tip: Next time, pass the URL straight into the command: python file.py <URL>")
        target_directory = input("Please paste the TARGET_DIRECTORY URL and press Enter: ").strip()

    if not target_directory.startswith(("http://", "https://")):
        print("[-] Error: Invalid URL layout. Please verify it starts with http:// or https://")
        sys.exit(1)

    print(f"\n[*] Commencing discovery pipeline on: {target_directory}")
    profile_urls = get_profile_links_from_directory(target_directory)
    print(f"[+] Identified {len(profile_urls)} potential faculty profile paths.")

    all_extracted_faculty = []
    try:
        for i, profile_url in enumerate(profile_urls, start=1):
            print(f"[{i}/{len(profile_urls)}] Processing: {profile_url}")
            all_extracted_faculty.extend(extract_contacts_deep(profile_url))
    except KeyboardInterrupt:
        # The export below runs once, after the crawl; on Ctrl+C keep whatever
        # has been gathered instead of losing it.
        print("\n[!] Interrupted. Saving the records collected so far.")

    # Export once, after all pages were processed. Doing this inside the loop
    # rewrites the CSV and prints the final status message for every URL, and
    # prints nothing at all when no links were found.
    if not all_extracted_faculty:
        print("\n[-] Pipeline closed. No contact cards could be mapped successfully.")
        return

    desired_order = ["Name", "Email", "Designation", "Phone", "Profile URL"]
    df = pd.DataFrame(all_extracted_faculty)[desired_order]
    df = df.drop_duplicates(subset=['Email', 'Name'], keep='first')

    # Name the output after the target host; ":" (from a port number) is not a
    # valid file-name character on every platform.
    domain = urllib.parse.urlparse(target_directory).netloc.replace("www.", "").replace(":", "_")
    output_file = f"extracted_faculty_{domain}.csv"
    df.to_csv(output_file, index=False)
    print(f"\n[Done] Pipeline saved {len(df)} records safely inside '{output_file}'!")


if __name__ == "__main__":
    main()