# extractor.py
import re
import string

from bs4 import BeautifulSoup

STOP_WORDS = [
    "Impressum",
    "Datenschutz",
    "Privacy",
    "AGB",
    "Terms",
    "Kontakt",
    "Kontaktformular",
]

# Characters that clean_printable() keeps: everything in string.printable plus
# German umlauts/eszett, the euro sign and some typographic punctuation.
# Built once at import time instead of on every call.
_PRINTABLE_CHARS = frozenset(string.printable + "äöüÄÖÜ߀–—„“‚‘…")


def clean_printable(text):
    """Return *text* with every character outside the allowed set removed.

    Args:
        text: String to filter.

    Returns:
        A new string made of the characters of ``text`` that are members of
        ``_PRINTABLE_CHARS``, in their original order.
    """
    return "".join(ch for ch in text if ch in _PRINTABLE_CHARS)


# ---------------------------------------------------------
# MAIL + WEBSITE: only from exhibitors_profile_grid_pos4
# ---------------------------------------------------------
def extract_mailto_and_website_html(html):
    """Extract the e-mail address and website link from the contact list item.

    Only ``<a href=...>`` elements inside the first
    ``<li class="exhibitors_profile_grid_pos4">`` are considered. If several
    links of the same kind exist, the last one wins.

    Args:
        html: HTML source of the page.

    Returns:
        A ``(mailto, website)`` tuple of strings. ``mailto`` is the address
        without the ``mailto:`` scheme and without any ``?subject=...`` query
        part; ``website`` is the full ``http``/``https`` URL. Either value is
        ``""`` when nothing matching was found.
    """
    mailto = ""
    website = ""
    if not html:
        return mailto, website

    soup = BeautifulSoup(html, "html.parser")
    contact_li = soup.find("li", class_="exhibitors_profile_grid_pos4")
    if contact_li is None:
        return mailto, website

    mailto_prefix = "mailto:"
    for link in contact_li.find_all("a", href=True):
        href = link["href"].strip()
        # URI schemes are case-insensitive, so compare on a lowered copy.
        lowered = href.lower()
        if lowered.startswith(mailto_prefix):
            # Cut off only the leading scheme, then drop header fields such
            # as "?subject=..." so that just the address remains.
            address, _, _ = href[len(mailto_prefix):].partition("?")
            mailto = address.strip()
        elif lowered.startswith("http"):
            website = href

    return mailto, website


# ---------------------------------------------------------
# PROFILE: only from <h3>Unternehmensprofil</h3> + the following <p>
# ---------------------------------------------------------
def extract_profile_from_html(html):
    """Return the company profile text that follows the profile heading.

    Looks for the first ``<h3>`` whose string contains "Unternehmensprofil"
    and takes the next ``<p>`` element after it in the document.

    Args:
        html: HTML source of the page.

    Returns:
        The paragraph text with each ``<br>`` turned into one newline,
        stripped of surrounding whitespace and filtered through
        ``clean_printable``. ``""`` if the heading or the paragraph is
        missing, or if ``html`` is empty.
    """
    if not html:
        return ""

    soup = BeautifulSoup(html, "html.parser")
    heading = soup.find("h3", string=lambda t: t and "Unternehmensprofil" in t)
    if heading is None:
        return ""

    paragraph = heading.find_next("p")
    if paragraph is None:
        return ""

    # <br> -> line breaks
    for br in paragraph.find_all("br"):
        br.replace_with("\n")

    # Join the text nodes without a separator: the <br> tags are already
    # newline strings, so joining with "\n" again would triple every line
    # break and also split the text at inline tags such as <strong>.
    text = paragraph.get_text().strip()
    return clean_printable(text)


# ---------------------------------------------------------
# MAIN FUNCTION
# ---------------------------------------------------------
def extract_structured_data(html, url):
    """Build the structured record for one page.

    The contact fields are read positionally from the page's visible text:
    the non-empty, stripped text lines are searched for the exact line
    "Kontaktdaten" and the fields are taken from fixed offsets after it.
    E-mail link, website and profile text are extracted from the HTML by
    ``extract_mailto_and_website_html`` and ``extract_profile_from_html``.

    Args:
        html: HTML source of the page.
        url: Address of the page; stored unchanged under ``"Link"``.

    Returns:
        A dict with the keys ``"Link"``, ``"Name"``, ``"Straße"``, ``"Ort"``,
        ``"Kontakt"``, ``"Mail"``, ``"Telefon"``, ``"mailto"``, ``"Website"``
        and ``"Profil"``.

    Raises:
        ValueError: If the "Kontaktdaten" line is missing or fewer than
            seven text lines follow it.
    """
    soup = BeautifulSoup(html, "html.parser")
    stripped = (raw.strip() for raw in soup.get_text("\n").split("\n"))
    lines = [line for line in stripped if line]

    # --- contact data ---
    try:
        idx = lines.index("Kontaktdaten")
        name = lines[idx + 1]
        street = lines[idx + 2]
        ort = lines[idx + 3]
        # NOTE(review): the line at idx + 4 is not read — presumably a
        # country or label line; confirm against the page layout.
        kontakt = lines[idx + 5]
        mail = lines[idx + 6]
        telefon = lines[idx + 7]
    except (ValueError, IndexError) as e:
        # list.index raises ValueError, the lookups raise IndexError; both
        # are reported to callers as ValueError with the cause chained.
        raise ValueError(f"Kontaktdaten unvollständig: {e}") from e

    # --- mailto + website from HTML ---
    mailto, website = extract_mailto_and_website_html(html)

    # --- profile from HTML ---
    profil = extract_profile_from_html(html)

    return {
        "Link": url,  # plain text only
        "Name": name,
        "Straße": street,
        "Ort": ort,
        "Kontakt": kontakt,
        "Mail": mail,
        "Telefon": telefon,
        "mailto": mailto,  # plain text only
        "Website": website,  # plain text only
        "Profil": profil,
    }