You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

113 lines
2.9 KiB

# extractor.py
import re
import string
from bs4 import BeautifulSoup
# Link/label texts for legal and contact boilerplate (imprint, privacy,
# terms, contact form) to be filtered out when scanning pages.
# NOTE(review): not referenced anywhere in this file — presumably consumed
# by a sibling module that imports it; confirm before removing.
STOP_WORDS = [
"Impressum",
"Datenschutz",
"Privacy",
"AGB",
"Terms",
"Kontakt",
"Kontaktformular"
]
def clean_printable(text):
printable = set(string.printable + "äöüÄÖÜ߀–—„“‚‘…")
return "".join(ch for ch in text if ch in printable)
# ---------------------------------------------------------
# MAIL + WEBSITE NUR aus exhibitors_profile_grid_pos4
# ---------------------------------------------------------
def extract_mailto_and_website_html(html):
    """Pull the e-mail address and website URL from the contact block.

    Only anchors inside <li class="exhibitors_profile_grid_pos4"> are
    inspected; links elsewhere on the page are ignored.

    Returns:
        (mail, website) tuple of stripped strings; "" for anything absent.
    """
    parsed = BeautifulSoup(html, "html.parser")
    mail_addr, site_url = "", ""
    contact = parsed.find("li", class_="exhibitors_profile_grid_pos4")
    if not contact:
        return mail_addr, site_url
    for anchor in contact.find_all("a", href=True):
        target = anchor["href"].strip()
        if target.startswith("mailto:"):
            mail_addr = target.replace("mailto:", "").strip()
        elif target.startswith("http"):
            site_url = target.strip()
    return mail_addr, site_url
# ---------------------------------------------------------
# PROFIL NUR aus <h3>Unternehmensprofil</h3> + folgendem <p>
# ---------------------------------------------------------
def extract_profile_from_html(html):
    """Extract the company-profile text from the page.

    Finds the <h3> headline containing "Unternehmensprofil" and returns
    the cleaned text of the first <p> that follows it, with <br> tags
    converted to newlines. Returns "" when headline or paragraph is
    missing.
    """
    parsed = BeautifulSoup(html, "html.parser")
    headline = parsed.find("h3", string=lambda t: t and "Unternehmensprofil" in t)
    if not headline:
        return ""
    paragraph = headline.find_next("p")
    if not paragraph:
        return ""
    # Turn explicit line breaks into newline characters before extracting text.
    for linebreak in paragraph.find_all("br"):
        linebreak.replace_with("\n")
    return clean_printable(paragraph.get_text("\n").strip())
# ---------------------------------------------------------
# HAUPTFUNKTION
# ---------------------------------------------------------
def extract_structured_data(html, url):
    """Parse an exhibitor detail page into a flat dict of strings.

    Args:
        html: full page HTML.
        url: page URL, stored verbatim under the "Link" key.

    Returns:
        dict with keys Link, Name, Straße, Ort, Kontakt, Mail, Telefon,
        mailto, Website, Profil.

    Raises:
        ValueError: if the "Kontaktdaten" section is missing or truncated.
    """
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text("\n")
    lines = [l.strip() for l in text.split("\n") if l.strip()]
    # --- Kontaktdaten: fixed offsets after the "Kontaktdaten" headline ---
    # Offset idx + 4 is skipped — presumably a label line between Ort and
    # Kontakt in the page layout; TODO confirm against a live page.
    try:
        idx = lines.index("Kontaktdaten")
        name = lines[idx + 1]
        street = lines[idx + 2]
        ort = lines[idx + 3]
        kontakt = lines[idx + 5]
        mail = lines[idx + 6]
        telefon = lines[idx + 7]
    except (ValueError, IndexError) as e:
        # ValueError: headline not found; IndexError: section cut short.
        raise ValueError(f"Kontaktdaten unvollständig: {e}") from e
    # --- Mailto + Website only from the contact <li> in the HTML ---
    mailto, website = extract_mailto_and_website_html(html)
    # --- Profile only from <h3>Unternehmensprofil</h3> + following <p> ---
    profil = extract_profile_from_html(html)
    return {
        "Link": url,  # plain text only
        "Name": name,
        "Straße": street,
        "Ort": ort,
        "Kontakt": kontakt,
        "Mail": mail,
        "Telefon": telefon,
        "mailto": mailto,  # plain text only
        "Website": website,  # plain text only
        "Profil": profil,
    }