You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
28 lines
671 B
28 lines
671 B
# scraper.py
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
from crawler.extractor import extract_structured_data
|
|
from crawler.utils import log
|
|
|
|
|
|
def fetch_and_parse(url):
    """Download ``url`` and pass its HTML to ``extract_structured_data``.

    The page is requested with a fixed ``User-Agent`` header and a
    10-second timeout. The response body is then forwarded, together with
    the URL, to ``extract_structured_data``.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``extract_structured_data(html, url)`` returns.

    Raises:
        Exception: Any error raised while requesting the page (including
            the one raised by ``raise_for_status()``) or while extracting
            data is logged via ``log.error`` and then re-raised unchanged.
    """
    log.info(f"Abruf: {url}")

    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        response.raise_for_status()
    except Exception as fetch_error:
        log.error(f"Fehler beim Abruf von {url}: {fetch_error}")
        raise

    # IMPORTANT: hand the HTML to the extractor, NOT plain text!
    page_html = response.text

    try:
        structured = extract_structured_data(page_html, url)
    except Exception as extract_error:
        log.error(f"Extraktionsfehler bei {url}: {extract_error}")
        raise
    return structured