You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
38 lines
876 B
38 lines
876 B
# __main__.py
|
|
import sys
|
|
import pandas as pd
|
|
import time
|
|
from crawler.scraper import fetch_and_parse
|
|
from crawler.utils import log
|
|
|
|
|
|
def main():
    """Crawl every URL listed in a text file and save the results to Excel.

    Two positional command-line arguments are read from ``sys.argv``:

    1. the path of a UTF-8 text file containing one URL per line
       (blank and whitespace-only lines are skipped), and
    2. the path of the Excel file to write.

    Each URL is passed to ``fetch_and_parse``. A URL whose processing raises
    is logged and skipped, so a single failure does not abort the run. All
    collected results are loaded into a ``pandas.DataFrame`` and written
    with ``DataFrame.to_excel`` (without the index column).

    Raises:
        SystemExit: With exit code 1 when fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        # Usage errors go to stderr so they never mix with regular output.
        print(
            "Usage: python -m crawler <urls.txt> <output.xlsx>",
            file=sys.stderr,
        )
        sys.exit(1)

    url_file, output_file = sys.argv[1], sys.argv[2]

    log.info(f"Lade URL-Liste aus {url_file}")

    with open(url_file, "r", encoding="utf-8") as f:
        # Strip each line once, then drop the ones that end up empty.
        stripped_lines = (line.strip() for line in f)
        urls = [url for url in stripped_lines if url]

    results = []
    last_index = len(urls) - 1

    for index, url in enumerate(urls):
        try:
            results.append(fetch_and_parse(url))
        except Exception as e:
            # Top-level boundary: report the failing URL and keep going.
            log.error(f"Fehler bei {url}: {e}")

        # Pause one second between URLs; there is nothing to wait for after
        # the final one.
        # NOTE(review): assumes the pause applies to every URL, not only to
        # failed ones - confirm against the intended crawl behaviour.
        if index < last_index:
            time.sleep(1)

    # NOTE(review): presumably each result is a dict-like record that becomes
    # one row - verify against fetch_and_parse.
    df = pd.DataFrame(results)
    df.to_excel(output_file, index=False)

    log.info(f"Fertig! Datei gespeichert: {output_file}")
|
|
|
|
# Run the crawler only when this module is executed as the program entry
# point (its __name__ is "__main__"), not when it is merely imported.
if __name__ == "__main__":
    main()
|