Skip to content

Name & email Extract

Note: this code is still being built as a universal extractor that pulls names and emails from all types of web pages.

  • This already works for many types of links.

import os
import re
import sys
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import requests
import urllib3

# Suppress messy InsecureRequestWarning warnings in terminal screen
# (the shared session below switches certificate verification off, which
# would otherwise trigger this warning on every HTTPS request).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Robust patterns for global email and phone discovery
# EMAIL_PATTERN: local-part "@" domain "." TLD, where the TLD is 2+ letters.
EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
# PHONE_PATTERN: a "+94" or "0" prefix, two digits (the first one non-zero),
# then a 3-digit and a 4-digit group, each optionally preceded by whitespace
# and/or a hyphen.
# NOTE(review): the +94 prefix suggests only Sri Lankan numbers are in scope — confirm.
PHONE_PATTERN = re.compile(r"(?:\+94|0)[1-9][0-9]\s?[- ]?[0-9]{3}\s?[- ]?[0-9]{4}")

# Loose Title matching pattern to clean titles out of names if they bleed in
# NOTE(review): because of re.IGNORECASE, the bare `I` / `II` alternatives also
# match a standalone initial such as the "I" in "Dr. I. Perera".
DESIGNATION_CLEAN_PATTERN = re.compile(
    r"\b(Senior Professor|Senior Lecturer|Assistant Lecturer|Professor|Lecturer|Instructor|Head|Dean|HOD|Probationary|Grade I|Grade II|II|I|Visiting|Chair)\b",
    re.IGNORECASE
)

# Skips pages that do not host individual personnel profiles
# Each entry is compared as a substring of the lower-cased absolute URL by
# get_profile_links_from_directory; any hit discards the link.
BLACKLIST_KEYWORDS = [
    'accreditation', 'scholarship', 'schedule', 'guidelines', 'structure', 'research', 'mile-stone', 'grants', 
    'partners-in-leaning','student-handbook', 'news', 'programme', 'internship', 'dissertation', 'exemption', 
    'calendar-of-dates', 'heads-of-the-department','library','student-resources','msu', 'curriculum', 'syllabus', 
    'fee', 'exam', 'admission', 'course','activity','ethics-review-committee','iqac','alumni','video-resources',
    'history', 'gallery', 'event', 'notice', 'download', 'publication'
]

# Browser-like request headers; installed on the shared session below so every
# request sent through it carries them.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1"
}

# Persistent network session to speed up sequential loop downloads
session = requests.Session()
session.headers.update(HEADERS)
# NOTE: verify=False turns TLS certificate validation off for every request
# made through this session, not only for the misconfigured hosts.
session.verify = False  # Global bypass for local university SSL configuration errors

def clean_extracted_name(name_str):
    """Remove academic designation keywords and stray separators from a name.

    Args:
        name_str: Raw name text. It may contain rank words (anything matched by
            ``DESIGNATION_CLEAN_PATTERN``) and leading/trailing separators such
            as ``-``, ``,``, ``|`` or ``:``. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned name with whitespace collapsed to single spaces, or
        ``"Name Check Required"`` when the input is empty or nothing is left
        once designations and separators have been removed.
    """
    placeholder = "Name Check Required"
    if not name_str:
        return placeholder

    without_titles = DESIGNATION_CLEAN_PATTERN.sub("", name_str)
    # Trim whole runs of separators/whitespace at both ends. The `$` must stay
    # unescaped: it is the end-of-string anchor, not a literal dollar sign.
    trimmed = re.sub(r'^[\s\-–—,•|:]+|[\s\-–—,•|:]+$', '', without_titles)
    cleaned = " ".join(trimmed.split())
    # Never hand back an empty name: callers compare against the placeholder
    # to decide whether a name was found.
    return cleaned or placeholder

def discover_name_on_profile(soup):
    """Find the person's name on a profile page from its headings or <title>.

    Headings are scanned first (all ``h1``, then ``h2``, then ``h3``). The
    first heading that contains an honorific marker and still has text left
    after designation cleanup is returned. If no heading qualifies, the part
    of the page title before the first ``|`` / ``-`` is tried the same way.

    Args:
        soup: Parsed page exposing ``find_all`` and ``title`` the way a
            BeautifulSoup document does.

    Returns:
        The cleaned name, or ``"Name Check Required"`` when nothing usable is
        found.
    """
    placeholder = "Name Check Required"
    honorifics = ('prof', 'dr.', 'mr.', 'ms.', 'mrs.')

    for tag_name in ('h1', 'h2', 'h3'):
        for heading in soup.find_all(tag_name):
            text = heading.get_text().strip()
            if not any(marker in text.lower() for marker in honorifics):
                continue
            candidate = clean_extracted_name(text)
            # A heading such as "Professor" cleans down to nothing; keep
            # looking instead of reporting an empty name.
            if candidate and candidate != placeholder:
                return candidate

    if soup.title and soup.title.string:
        title_text = soup.title.string
        # Keep only the segment before the first divider of each kind.
        for delimiter in ('|', '-'):
            title_text = title_text.split(delimiter)[0]
        title_text = title_text.strip()

        if any(marker in title_text.lower() for marker in honorifics + ('staff',)):
            candidate = clean_extracted_name(
                title_text.replace('Staff', '').replace('Profile', '')
            )
            # e.g. "Staff Profile" leaves no name at all after cleanup.
            if candidate and candidate != placeholder:
                return candidate

    return placeholder

def discover_designation_on_profile(soup):
    """Classify the academic rank mentioned in a profile page's text.

    Leadership roles take precedence over lecturer/professor ranks.

    Args:
        soup: Parsed page exposing ``get_text(separator=...)``.

    Returns:
        ``"HOD"`` or the title-cased leadership role if one is mentioned,
        otherwise the rank with any grade numeral upper-cased (for example
        ``"Senior Lecturer Grade II"``), otherwise ``"Academic Staff"``.
    """
    page_text = soup.get_text(separator=" ")

    # NOTE(review): this searches the whole page text, so a navigation entry
    # containing "Head" or "Dean" would also classify the page — confirm this
    # is acceptable for the target sites.
    leader_match = re.search(r'\b(Head of Department|Head|HOD|Dean)\b', page_text, re.IGNORECASE)
    if leader_match:
        role = leader_match.group(0)
        return "HOD" if role.lower() == 'hod' else role.title()

    rank_match = re.search(r'\b(Senior Professor|Senior Lecturer|Assistant Lecturer|Lecturer|Professor)\s*(?:Grade\s*[IVX12i]+|[IVX12i]+|(Confirmed)|(Probationary))?\b', page_text, re.IGNORECASE)
    if rank_match:
        # str.title() would turn "II" into "Ii", so grade numerals are
        # upper-cased separately from the rank words.
        return " ".join(
            word.upper() if re.fullmatch(r'[IVX12]+', word, re.IGNORECASE) else word.title()
            for word in rank_match.group(0).split()
        )

    return "Academic Staff"

def parse_details_from_snippet(text):
    """Extract a name, designation and phone number from a text snippet.

    Args:
        text: Free text surrounding one contact entry; whitespace is collapsed
            before matching.

    Returns:
        A ``(name, designation, phone)`` tuple of strings. Fallbacks are
        ``"Name Check Required"``, ``"Lecturer / Academic Staff"`` and
        ``"N/A"`` respectively.
    """
    clean_text = " ".join(text.split())

    # Default: first designation keyword found anywhere in the snippet.
    desig_match = DESIGNATION_CLEAN_PATTERN.search(clean_text)
    designation = desig_match.group(0).strip() if desig_match else "Lecturer / Academic Staff"

    # A leadership role overrides the default; otherwise a lecturer rank
    # (with optional grade) does.
    leader_match = re.search(r'\b(Head of Department|Head|HOD|Dean)\b', clean_text, re.IGNORECASE)
    if leader_match:
        role = leader_match.group(0)
        designation = "HOD" if role.lower() == 'hod' else role.title()
    else:
        grade_match = re.search(r'\b(Senior Lecturer|Lecturer|Assistant Lecturer)\s*(?:Grade\s*[IVX12i]+|[IVX12i]+)?\b', clean_text, re.IGNORECASE)
        if grade_match:
            # str.title() would turn "II" into "Ii"; upper-case numerals instead.
            designation = " ".join(
                word.upper() if re.fullmatch(r'[IVX12]+', word, re.IGNORECASE) else word.title()
                for word in grade_match.group(0).split()
            )

    # Only the honorific is matched case-insensitively. Name tokens must start
    # with a capital letter; with a global IGNORECASE the capture would run on
    # through ordinary lowercase words that follow the name.
    name_match = re.search(
        r"\b(?i:Prof\.|Dr\.|Mr\.|Mrs\.|Ms\.|Ven\.)\s+"
        r"(?:(?:[A-Z]\.\s*)+[A-Z][A-Za-z]+|(?:[A-Z][A-Za-z]+\s+)+[A-Z][A-Za-z]+|[A-Z][A-Za-z]+)",
        clean_text,
    )
    final_clean_name = "Name Check Required"
    if name_match:
        # Fall back to the placeholder if cleanup leaves nothing behind.
        final_clean_name = clean_extracted_name(name_match.group(0).strip()) or "Name Check Required"

    phone_match = PHONE_PATTERN.search(clean_text)
    phone = phone_match.group(0) if phone_match else "N/A"

    return final_clean_name, designation, phone

def get_profile_links_from_directory(directory_url):
    """Collect candidate staff-profile links from a directory page.

    The page is fetched through the shared ``session``. A link is kept when it
    is an http(s) URL on the directory's own site (same registrable host or a
    sub-domain of it), has a non-trivial path, is not the directory page
    itself, and its URL contains neither a ``BLACKLIST_KEYWORDS`` entry nor one
    of the excluded file/section markers.

    Args:
        directory_url: Absolute URL of the staff directory page.

    Returns:
        A sorted list of absolute URLs with fragments removed. The list is
        empty when the page cannot be fetched and may be partial when an
        unexpected error interrupts the scan (the error is printed).
    """
    profile_links = set()
    try:
        response = session.get(directory_url, timeout=15)
        if response.status_code != 200:
            print(f"[-] Directory connection rejected. Status Code: {response.status_code}")
            return []
        soup = BeautifulSoup(response.text, "html.parser")

        # Site boundary: last three host labels for *.ac.lk hosts, else two.
        directory_host = (urllib.parse.urlparse(directory_url).hostname or "").lower()
        if directory_host.startswith("www."):
            directory_host = directory_host[4:]
        label_count = 3 if directory_host.endswith(".ac.lk") else 2
        uni_root_boundary = ".".join(directory_host.split(".")[-label_count:])

        # Fragment-free, slash-insensitive form used to recognise self links.
        directory_key = urllib.parse.urldefrag(directory_url)[0].rstrip("/")

        for anchor in soup.find_all("a", href=True):
            href = anchor["href"].strip()
            if not href or href in ("#", "/", "index.php", "index.html", "home"):
                continue

            try:
                # Drop "#fragment" so in-page anchors collapse onto their page.
                full_url = urllib.parse.urldefrag(urllib.parse.urljoin(directory_url, href))[0]
                parsed_sublink = urllib.parse.urlparse(full_url)
                link_host = (parsed_sublink.hostname or "").lower()
            except ValueError:
                # A malformed href must not abort the whole scan.
                continue

            if parsed_sublink.scheme not in ("http", "https") or not link_host:
                continue
            # Exact host or true sub-domain only; a substring test would also
            # accept look-alike hosts such as "ruh.ac.lk.example.com".
            if link_host != uni_root_boundary and not link_host.endswith("." + uni_root_boundary):
                continue

            sublink_path = parsed_sublink.path.rstrip("/")
            if sublink_path in ("", "/index.php"):
                continue
            if full_url.rstrip("/") == directory_key:
                continue

            url_lower = full_url.lower()
            if any(kw in url_lower for kw in BLACKLIST_KEYWORDS):
                continue
            if any(x in url_lower for x in ('.pdf', '.jpg', '.png', '.zip', 'contact', 'about', 'lms')):
                continue
            profile_links.add(full_url)
    except Exception as e:
        # Deliberate best-effort boundary: report and return what was found.
        print(f"[-] Error collecting profile links: {e}")
    # Sorted so repeated runs visit pages in the same order.
    return sorted(profile_links)

def extract_contacts_deep(url):
    """Fetch one page and extract contact records from it.

    A page showing more than three distinct e-mail addresses is treated as a
    shared listing: one record is built per address from the text around its
    first occurrence. Any other page is treated as a single person's profile
    and yields at most one record.

    Args:
        url: Absolute URL of the page to fetch through the shared ``session``.

    Returns:
        A list of dicts with the keys ``"Name"``, ``"Email"``,
        ``"Designation"``, ``"Phone"`` and ``"Profile URL"``. The list is empty
        when the request fails, the status is not 200, or nothing is found.
    """
    records = []
    try:
        response = session.get(url, timeout=12)
        if response.status_code != 200:
            print(f"[-] Profile link rejected by server. Status Code: {response.status_code}")
            return records
        soup = BeautifulSoup(response.text, "html.parser")
        raw_text = soup.get_text(separator=" ")
    except Exception as e:
        print(f"[-] Connection failed to {url}: {e}")
        return records

    # Distinct addresses in page order, each with the offset of its first real
    # match. Page order keeps the "first e-mail" choice stable between runs,
    # and match offsets avoid landing inside a longer address that merely
    # contains this one as a substring.
    first_offsets = {}
    for match in EMAIL_PATTERN.finditer(raw_text):
        first_offsets.setdefault(match.group(0), match.start())
    surface_emails = list(first_offsets)

    # CASE 1: The page is a shared dynamic master table containing everyone's details
    if len(surface_emails) > 3:
        print(f"[+] Multiple emails found directly on landing directory page: {url}")
        for email, email_pos in first_offsets.items():
            # 250 characters of context on either side of the address.
            window = raw_text[max(0, email_pos - 250):email_pos + 250]
            snippet = " ".join(window.split())
            name, designation, phone = parse_details_from_snippet(snippet)
            records.append({
                "Name": name,
                "Email": email,
                "Designation": designation,
                "Phone": phone,
                "Profile URL": url
            })
        return records

    # CASE 2: The page is an individual personal profile biography path
    name = discover_name_on_profile(soup)
    designation = discover_designation_on_profile(soup)
    email = surface_emails[0] if surface_emails else "N/A"
    phone_match = PHONE_PATTERN.search(raw_text)
    phone = phone_match.group(0) if phone_match else "N/A"

    # Keep the page only if it produced at least a name or an address.
    if name != "Name Check Required" or email != "N/A":
        records.append({
            "Name": name,
            "Email": email,
            "Designation": designation,
            "Phone": phone,
            "Profile URL": url
        })

    return records


# --- RUNTIME EXECUTION CONTROLLER ---
# Usage: python file.py <URL>   (the URL is prompted for when omitted)
if __name__ == "__main__":
    # Check if a URL was passed directly in the terminal invocation
    if len(sys.argv) > 1:
        TARGET_DIRECTORY = sys.argv[1].strip()
    else:
        print("💡 Tip: Next time, pass the URL straight into the command: python file.py <URL>")
        TARGET_DIRECTORY = input("Please paste the TARGET_DIRECTORY URL and press Enter: ").strip()

    # Only absolute http(s) URLs are accepted
    if not TARGET_DIRECTORY.startswith(("http://", "https://")):
        print("[-] Error: Invalid URL layout. Please verify it starts with http:// or https://")
        sys.exit(1)

    print(f"\n[*] Commencing discovery pipeline on: {TARGET_DIRECTORY}")
    profile_urls = get_profile_links_from_directory(TARGET_DIRECTORY)
    print(f"[+] Identified {len(profile_urls)} potential faculty profile paths.")

    all_extracted_faculty = []

    # Process each candidate page sequentially
    for i, profile_url in enumerate(profile_urls, start=1):
        print(f"[{i}/{len(profile_urls)}] Processing: {profile_url}")
        all_extracted_faculty.extend(extract_contacts_deep(profile_url))

    # Export once, after every page has been processed, so the CSV and the
    # final status line are produced a single time (and a status is printed
    # even when no links were found).
    if all_extracted_faculty:
        df = pd.DataFrame(all_extracted_faculty)
        # The same person can surface on several pages; keep the first hit.
        df = df.drop_duplicates(subset=['Email', 'Name'], keep='first')

        # Column order of the exported file
        desired_order = ["Name", "Email", "Designation", "Phone", "Profile URL"]
        df = df[desired_order]

        # File name is derived from the directory's host; a ":" from an
        # explicit port is replaced because it is not valid in file names on
        # every platform.
        domain = urllib.parse.urlparse(TARGET_DIRECTORY).netloc.replace("www.", "").replace(":", "_")
        output_file = f"extracted_faculty_{domain}.csv"
        df.to_csv(output_file, index=False)
        print(f"\n[Done] Pipeline saved {len(df)} records safely inside '{output_file}'!")
    else:
        print("\n[-] Pipeline closed. No contact cards could be mapped successfully.")