You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
113 lines
2.9 KiB
113 lines
2.9 KiB
# extractor.py
|
|
import re
|
|
import string
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Navigation/footer labels (German + English) used to recognise
# non-content lines on the scraped exhibitor pages.
STOP_WORDS = [
    "Impressum",
    "Datenschutz",
    "Privacy",
    "AGB",
    "Terms",
    "Kontakt",
    "Kontaktformular",
]
|
|
|
|
def clean_printable(text):
|
|
printable = set(string.printable + "äöüÄÖÜ߀–—„“‚‘…")
|
|
return "".join(ch for ch in text if ch in printable)
|
|
|
|
|
|
# ---------------------------------------------------------
|
|
# MAIL + WEBSITE NUR aus exhibitors_profile_grid_pos4
|
|
# ---------------------------------------------------------
|
|
def extract_mailto_and_website_html(html):
    """Pull the e-mail address and website URL from the contact <li>.

    Only anchors inside <li class="exhibitors_profile_grid_pos4"> are
    considered; when several anchors match, the last one wins.  Returns
    a ``(mailto, website)`` tuple of strings ("" for anything missing).
    """
    soup = BeautifulSoup(html, "html.parser")

    email_addr = ""
    site_url = ""

    container = soup.find("li", class_="exhibitors_profile_grid_pos4")
    if container is None:
        return email_addr, site_url

    for anchor in container.find_all("a", href=True):
        link = anchor["href"].strip()
        if link.startswith("mailto:"):
            email_addr = link.replace("mailto:", "").strip()
        elif link.startswith("http"):
            site_url = link.strip()

    return email_addr, site_url
|
|
|
|
|
|
# ---------------------------------------------------------
|
|
# PROFIL NUR aus <h3>Unternehmensprofil</h3> + folgendem <p>
|
|
# ---------------------------------------------------------
|
|
def extract_profile_from_html(html):
    """Extract the company profile text from the page HTML.

    Finds the <h3> heading whose text contains "Unternehmensprofil" and
    takes the first <p> element that follows it.  <br> tags inside that
    paragraph become newline characters.  Returns "" when either the
    heading or the paragraph is missing.
    """
    soup = BeautifulSoup(html, "html.parser")

    heading = soup.find("h3", string=lambda t: t and "Unternehmensprofil" in t)
    if heading is None:
        return ""

    paragraph = heading.find_next("p")
    if paragraph is None:
        return ""

    # Turn explicit line breaks into newlines before extracting the text.
    for line_break in paragraph.find_all("br"):
        line_break.replace_with("\n")

    return clean_printable(paragraph.get_text("\n").strip())
|
|
|
|
|
|
# ---------------------------------------------------------
|
|
# HAUPTFUNKTION
|
|
# ---------------------------------------------------------
|
|
def extract_structured_data(html, url):
    """Build the structured exhibitor record for one profile page.

    Contact fields are read positionally from the plain-text lines that
    follow the "Kontaktdaten" heading; the mailto/website pair and the
    profile text come from the HTML via the dedicated helpers.

    Raises ValueError when the "Kontaktdaten" block is missing or has
    fewer lines than expected.
    """
    soup = BeautifulSoup(html, "html.parser")
    plain_text = soup.get_text("\n")

    # Non-empty, whitespace-trimmed text lines of the whole page.
    rows = [row.strip() for row in plain_text.split("\n") if row.strip()]

    name = street = ort = kontakt = mail = telefon = ""
    mailto = website = ""
    profil = ""

    # --- contact block: fixed offsets after the "Kontaktdaten" heading ---
    try:
        anchor = rows.index("Kontaktdaten")

        name = rows[anchor + 1]
        street = rows[anchor + 2]
        ort = rows[anchor + 3]
        # anchor + 4 is deliberately skipped (layout line in the source page)
        kontakt = rows[anchor + 5]
        mail = rows[anchor + 6]
        telefon = rows[anchor + 7]

    except Exception as e:
        raise ValueError(f"Kontaktdaten unvollständig: {e}")

    # --- mailto + website from the contact <li> in the HTML ---
    mailto, website = extract_mailto_and_website_html(html)

    # --- profile section from the HTML ---
    profil = extract_profile_from_html(html)

    return {
        "Link": url,  # plain text only
        "Name": name,
        "Straße": street,
        "Ort": ort,
        "Kontakt": kontakt,
        "Mail": mail,
        "Telefon": telefon,
        "mailto": mailto,  # plain text only
        "Website": website,  # plain text only
        "Profil": profil,
    }
|