diff --git a/extract_logs.py b/extract_logs.py
index d1971fd4f47157fef5f9ab0a7b3e2733a0c5ee00..502797077e7e9f318b278745ae5002a1a2fd6f68 100644
--- a/extract_logs.py
+++ b/extract_logs.py
@@ -2,7 +2,6 @@ import os
 import re
 from urllib.parse import parse_qs, urlencode
 import pandas as pd
-import concurrent.futures
 import pyarrow as pa
 import pyarrow.parquet as pq
 from tqdm import tqdm
@@ -34,7 +33,7 @@ def process_file(filename):
         filename (str): The path to the log file to process.
 
     Returns:
-        pandas.DataFrame: A DataFrame containing the unique extracted URLs.
+        set: A set containing the unique extracted URLs.
     """
     unique_urls = set()
     with open(filename, "r") as file:
@@ -48,7 +47,7 @@ def process_file(filename):
                 clean_query_string
             ):  # Only add if there are relevant parameters
                 unique_urls.add(f"/select?{clean_query_string}")
-    return pd.DataFrame(list(unique_urls), columns=["url"])
+    return unique_urls
 
 
 def extract_urls_from_logs():
@@ -77,33 +76,27 @@ def extract_urls_from_logs():
     writer = pq.ParquetWriter(output_file, schema, compression="snappy")
 
     all_unique_urls = set()
-    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-        futures = {
-            executor.submit(process_file, file): file for file in files_to_process
-        }
-
-        # Create a progress bar for the total number of files
-        with tqdm(total=len(files_to_process), desc="Overall progress") as pbar:
-            for future in concurrent.futures.as_completed(futures):
-                file = futures[future]
-                try:
-                    df = future.result()
-                    new_unique_urls = set(df["url"]) - all_unique_urls
-                    all_unique_urls.update(new_unique_urls)
-                    if new_unique_urls:
-                        table = pa.Table.from_pandas(
-                            pd.DataFrame(list(new_unique_urls), columns=["url"])
-                        )
-                        writer.write_table(table)
-                    pbar.update(1)
-                    pbar.set_postfix(
-                        {
-                            "File": os.path.basename(file),
-                            "New unique URLs": len(new_unique_urls),
-                        }
+
+    # Create a progress bar for the total number of files
+    with tqdm(total=len(files_to_process), desc="Overall progress") as pbar:
+        for file in files_to_process:
+            try:
+                new_unique_urls = process_file(file) - all_unique_urls
+                all_unique_urls.update(new_unique_urls)
+                if new_unique_urls:
+                    table = pa.Table.from_pandas(
+                        pd.DataFrame(list(new_unique_urls), columns=["url"])
                     )
-                except Exception as exc:
-                    print(f"{file} raised an exception: {exc}")
+                    writer.write_table(table)
+                pbar.update(1)
+                pbar.set_postfix(
+                    {
+                        "File": os.path.basename(file),
+                        "New unique URLs": len(new_unique_urls),
+                    }
+                )
+            except Exception as exc:
+                print(f"{file} raised an exception: {exc}")
 
     writer.close()
     return len(all_unique_urls)
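
For quick verification after running `extract_urls_from_logs()`, the Parquet output can be read back with pyarrow. A minimal sketch, assuming the writer's target is `unique_urls.parquet` (the actual `output_file` value is assigned outside the hunks above) and the schema has a single `url` string column:

```python
import pyarrow.parquet as pq

# Read back the Parquet file written by extract_urls_from_logs().
# NOTE: "unique_urls.parquet" is an assumed path; the real value of
# `output_file` is defined outside the hunks shown in this diff.
table = pq.read_table("unique_urls.parquet")
urls = table.column("url").to_pylist()

# Each incremental write_table() call above produces its own row group,
# but read_table() concatenates them transparently.
print(f"Total unique URLs: {len(urls)}")
print("Sample:", urls[:5])
```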