# extractor.py
"""Extract structured exhibitor data (contact details, profile text) from HTML pages."""
import re
import string

from bs4 import BeautifulSoup

# Navigation/footer labels (site boilerplate) — kept for use elsewhere in this module.
STOP_WORDS = [
    "Impressum", "Datenschutz", "Privacy", "AGB",
    "Terms", "Kontakt", "Kontaktformular"
]


def clean_printable(text):
    """Drop every character outside ASCII-printable plus common German/typographic glyphs."""
    printable = set(string.printable + "äöüÄÖÜ߀–—„“‚‘…")
    return "".join(ch for ch in text if ch in printable)


# ---------------------------------------------------------
# MAIL + WEBSITE only from exhibitors_profile_grid_pos4
# ---------------------------------------------------------
def extract_mailto_and_website_html(html):
    """Return (email, website) found in the contact <li>; empty strings when absent.

    Only anchors inside the <li class="exhibitors_profile_grid_pos4"> element
    are considered; later matches overwrite earlier ones.
    """
    soup = BeautifulSoup(html, "html.parser")
    mailto = ""
    website = ""
    contact_li = soup.find("li", class_="exhibitors_profile_grid_pos4")
    if not contact_li:
        return mailto, website
    for a in contact_li.find_all("a", href=True):
        href = a["href"].strip()
        if href.startswith("mailto:"):
            # Strip only the leading scheme; replace() would also mangle an
            # address that happens to contain "mailto:" elsewhere.
            mailto = href[len("mailto:"):].strip()
        elif href.startswith(("http://", "https://")):
            # Require a full scheme so e.g. "httpdocs/..." is not mistaken
            # for a website URL.
            website = href.strip()
    return mailto, website
# ---------------------------------------------------------
def extract_profile_from_html(html):
    """Return the cleaned company-profile paragraph text, or "" when not found.

    Looks for the <h3> containing "Unternehmensprofil" and takes the first
    <p> that follows it in document order.
    """
    parsed = BeautifulSoup(html, "html.parser")
    heading = parsed.find("h3", string=lambda t: t and "Unternehmensprofil" in t)
    paragraph = heading.find_next("p") if heading else None
    if not paragraph:
        return ""
    # Turn <br> tags into explicit line breaks before extracting the text.
    for linebreak in paragraph.find_all("br"):
        linebreak.replace_with("\n")
    return clean_printable(paragraph.get_text("\n").strip())
# ---------------------------------------------------------
# MAIN FUNCTION
# ---------------------------------------------------------
def extract_structured_data(html, url):
    """Parse one exhibitor page into a flat dict of contact/profile fields.

    Parameters:
        html: raw page HTML.
        url:  source URL, stored verbatim under the "Link" key.

    Raises:
        ValueError: when the "Kontaktdaten" section is missing or truncated.
    """
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n")
    lines = [l.strip() for l in text.split("\n") if l.strip()]

    name = street = ort = kontakt = mail = telefon = ""
    mailto = website = ""
    profil = ""

    # --- Contact data: fields sit at fixed offsets below the "Kontaktdaten"
    # marker line. NOTE(review): offset idx+4 is deliberately skipped —
    # presumably a label or blank line in the page layout; confirm against
    # a sample page.
    try:
        idx = lines.index("Kontaktdaten")
        name = lines[idx + 1]
        street = lines[idx + 2]
        ort = lines[idx + 3]
        kontakt = lines[idx + 5]
        mail = lines[idx + 6]
        telefon = lines[idx + 7]
    except (ValueError, IndexError) as e:
        # ValueError: marker line missing; IndexError: section shorter than
        # expected. Catching only these (not bare Exception) keeps real
        # programming errors visible; chain the cause for debugging.
        raise ValueError(f"Kontaktdaten unvollständig: {e}") from e

    # --- mailto + website from the raw HTML ---
    mailto, website = extract_mailto_and_website_html(html)
    # --- Profile text from the raw HTML ---
    profil = extract_profile_from_html(html)

    return {
        "Link": url,          # text only
        "Name": name,
        "Straße": street,
        "Ort": ort,
        "Kontakt": kontakt,
        "Mail": mail,
        "Telefon": telefon,
        "mailto": mailto,     # text only
        "Website": website,   # text only
        "Profil": profil
    }