diff --git a/extract_logs.py b/extract_logs.py
index 2e5b06fe30047b5a391d9ff5b6cd4685924d1bf9..a61b7c2cba2c9ac020b7fe02d6319fb88056547a 100644
--- a/extract_logs.py
+++ b/extract_logs.py
@@ -5,6 +5,7 @@ import pandas as pd
 import concurrent.futures
 import pyarrow as pa
 import pyarrow.parquet as pq
+from tqdm import tqdm
 
 
 def clean_query(query_string):
@@ -73,18 +74,23 @@ def extract_urls_from_logs():
     total_urls = 0
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-        future_to_file = {
+        futures = {
             executor.submit(process_file, file): file for file in files_to_process
         }
-        for future in concurrent.futures.as_completed(future_to_file):
-            file = future_to_file[future]
-            try:
-                df = future.result()
-                total_urls += len(df)
-                table = pa.Table.from_pandas(df)
-                writer.write_table(table)
-            except Exception as exc:
-                print(f"{file} generated an exception: {exc}")
+
+        # Create a progress bar for the total number of files
+        with tqdm(total=len(files_to_process), desc="Overall progress") as pbar:
+            for future in concurrent.futures.as_completed(futures):
+                file = futures[future]
+                try:
+                    df = future.result()
+                    total_urls += len(df)
+                    table = pa.Table.from_pandas(df)
+                    writer.write_table(table)
+                    pbar.update(1)
+                    pbar.set_postfix({"File": os.path.basename(file), "URLs": len(df)})
+                except Exception as exc:
+                    print(f"{file} generated an exception: {exc}")
 
     writer.close()
     return total_urls
diff --git a/requirements.txt b/requirements.txt
index c6c07501b5d7577990624036c3e2558dec93df4f..728671b97df8631a18be750dd79463327a55caab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 locust
 locust-plugins
 pandas
-pyarrow
\ No newline at end of file
+pyarrow
+tqdm
\ No newline at end of file