# extractor.py
"""Extract structured exhibitor data (contact details, profile text) from HTML pages."""
import re
import string

from bs4 import BeautifulSoup

# Navigation/footer labels (site boilerplate) — kept for use elsewhere in this module.
STOP_WORDS = [
    "Impressum", "Datenschutz", "Privacy", "AGB",
    "Terms", "Kontakt", "Kontaktformular"
]


def clean_printable(text):
    """Drop every character outside ASCII-printable plus common German/typographic glyphs."""
    printable = set(string.printable + "äöüÄÖÜ߀–—„“‚‘…")
    return "".join(ch for ch in text if ch in printable)


# ---------------------------------------------------------
# MAIL + WEBSITE only from exhibitors_profile_grid_pos4
# ---------------------------------------------------------
def extract_mailto_and_website_html(html):
    """Return (email, website) found in the contact <li>; empty strings when absent.

    Only anchors inside the <li class="exhibitors_profile_grid_pos4"> element
    are considered; later matches overwrite earlier ones.
    """
    soup = BeautifulSoup(html, "html.parser")
    mailto = ""
    website = ""
    contact_li = soup.find("li", class_="exhibitors_profile_grid_pos4")
    if not contact_li:
        return mailto, website
    for a in contact_li.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("mailto:"):
            # Strip only the leading scheme; replace() would also mangle an
            # address that happens to contain "mailto:" elsewhere.
            mailto = href[len("mailto:"):].strip()
        elif href.startswith(("http://", "https://")):
            # Require a full scheme so e.g. "httpdocs/..." is not mistaken
            # for a website URL.
            website = href.strip()
    return mailto, website
# ---------------------------------------------------------
def extract_profile_from_html(html):
    """Return the cleaned company-profile paragraph text, or "" when not found.

    Looks for the <h3> containing "Unternehmensprofil" and takes the first
    <p> that follows it in document order.
    """
    parsed = BeautifulSoup(html, "html.parser")
    heading = parsed.find("h3", string=lambda t: t and "Unternehmensprofil" in t)
    paragraph = heading.find_next("p") if heading else None
    if not paragraph:
        return ""
    # Turn <br> tags into explicit line breaks before extracting the text.
    for linebreak in paragraph.find_all("br"):
        linebreak.replace_with("\n")
    return clean_printable(paragraph.get_text("\n").strip())
# ---------------------------------------------------------
# MAIN FUNCTION
# ---------------------------------------------------------
def extract_structured_data(html, url):
    """Parse one exhibitor page into a flat dict of contact/profile fields.

    Parameters:
        html: raw page HTML.
        url:  source URL, stored verbatim under the "Link" key.

    Raises:
        ValueError: when the "Kontaktdaten" section is missing or truncated.
    """
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n")
    lines = [l.strip() for l in text.split("\n") if l.strip()]

    name = street = ort = kontakt = mail = telefon = ""
    mailto = website = ""
    profil = ""

    # --- Contact data: fields sit at fixed offsets below the "Kontaktdaten"
    # marker line. NOTE(review): offset idx+4 is deliberately skipped —
    # presumably a label or blank line in the page layout; confirm against
    # a sample page.
    try:
        idx = lines.index("Kontaktdaten")
        name = lines[idx + 1]
        street = lines[idx + 2]
        ort = lines[idx + 3]
        kontakt = lines[idx + 5]
        mail = lines[idx + 6]
        telefon = lines[idx + 7]
    except (ValueError, IndexError) as e:
        # ValueError: marker line missing; IndexError: section shorter than
        # expected. Catching only these (not bare Exception) keeps real
        # programming errors visible; chain the cause for debugging.
        raise ValueError(f"Kontaktdaten unvollständig: {e}") from e

    # --- mailto + website from the raw HTML ---
    mailto, website = extract_mailto_and_website_html(html)
    # --- Profile text from the raw HTML ---
    profil = extract_profile_from_html(html)

    return {
        "Link": url,          # text only
        "Name": name,
        "Straße": street,
        "Ort": ort,
        "Kontakt": kontakt,
        "Mail": mail,
        "Telefon": telefon,
        "mailto": mailto,     # text only
        "Website": website,   # text only
        "Profil": profil
    }