You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

113 lines
2.9 KiB

# extractor.py
import re
import string
from bs4 import BeautifulSoup
# Link/label texts for legal and contact boilerplate (imprint, privacy,
# terms, contact form) to be filtered out when scanning pages.
# NOTE(review): not referenced anywhere in this file — presumably consumed
# by a sibling module that imports it; confirm before removing.
STOP_WORDS = [
"Impressum",
"Datenschutz",
"Privacy",
"AGB",
"Terms",
"Kontakt",
"Kontaktformular"
]
def clean_printable(text):
printable = set(string.printable + "äöüÄÖÜ߀–—„“‚‘…")
return "".join(ch for ch in text if ch in printable)
# ---------------------------------------------------------
# MAIL + WEBSITE NUR aus exhibitors_profile_grid_pos4
# ---------------------------------------------------------
def extract_mailto_and_website_html(html):
    """Pull the e-mail address and website URL from the contact block.

    Only anchors inside <li class="exhibitors_profile_grid_pos4"> are
    inspected; links elsewhere on the page are ignored.

    Returns:
        (mail, website) tuple of stripped strings; "" for anything absent.
    """
    parsed = BeautifulSoup(html, "html.parser")
    mail_addr, site_url = "", ""
    contact = parsed.find("li", class_="exhibitors_profile_grid_pos4")
    if not contact:
        return mail_addr, site_url
    for anchor in contact.find_all("a", href=True):
        target = anchor["href"].strip()
        if target.startswith("mailto:"):
            mail_addr = target.replace("mailto:", "").strip()
        elif target.startswith("http"):
            site_url = target.strip()
    return mail_addr, site_url
# ---------------------------------------------------------
# PROFIL NUR aus <h3>Unternehmensprofil</h3> + folgendem <p>
# ---------------------------------------------------------
def extract_profile_from_html(html):
    """Extract the company-profile text from the page.

    Finds the <h3> headline containing "Unternehmensprofil" and returns
    the cleaned text of the first <p> that follows it, with <br> tags
    converted to newlines. Returns "" when headline or paragraph is
    missing.
    """
    parsed = BeautifulSoup(html, "html.parser")
    headline = parsed.find("h3", string=lambda t: t and "Unternehmensprofil" in t)
    if not headline:
        return ""
    paragraph = headline.find_next("p")
    if not paragraph:
        return ""
    # Turn explicit line breaks into newline characters before extracting text.
    for linebreak in paragraph.find_all("br"):
        linebreak.replace_with("\n")
    return clean_printable(paragraph.get_text("\n").strip())
# ---------------------------------------------------------
# HAUPTFUNKTION
# ---------------------------------------------------------
def extract_structured_data(html, url):
    """Parse an exhibitor detail page into a flat dict of strings.

    Args:
        html: full page HTML.
        url: page URL, stored verbatim under the "Link" key.

    Returns:
        dict with keys Link, Name, Straße, Ort, Kontakt, Mail, Telefon,
        mailto, Website, Profil.

    Raises:
        ValueError: if the "Kontaktdaten" section is missing or truncated.
    """
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n")
    lines = [l.strip() for l in text.split("\n") if l.strip()]
    # --- Kontaktdaten: fixed offsets after the "Kontaktdaten" headline ---
    # Offset idx + 4 is skipped — presumably a label line between Ort and
    # Kontakt in the page layout; TODO confirm against a live page.
    try:
        idx = lines.index("Kontaktdaten")
        name = lines[idx + 1]
        street = lines[idx + 2]
        ort = lines[idx + 3]
        kontakt = lines[idx + 5]
        mail = lines[idx + 6]
        telefon = lines[idx + 7]
    except (ValueError, IndexError) as e:
        # ValueError: headline not found; IndexError: section cut short.
        raise ValueError(f"Kontaktdaten unvollständig: {e}") from e
    # --- Mailto + Website only from the contact <li> in the HTML ---
    mailto, website = extract_mailto_and_website_html(html)
    # --- Profile only from <h3>Unternehmensprofil</h3> + following <p> ---
    profil = extract_profile_from_html(html)
    return {
        "Link": url,  # plain text only
        "Name": name,
        "Straße": street,
        "Ort": ort,
        "Kontakt": kontakt,
        "Mail": mail,
        "Telefon": telefon,
        "mailto": mailto,  # plain text only
        "Website": website,  # plain text only
        "Profil": profil,
    }