From 89ad769b4475cea40726cd87f0c7363fb6157eb6 Mon Sep 17 00:00:00 2001
From: Thomas Baer <thomas.baer@slub-dresden.de>
Date: Wed, 18 Sep 2024 11:08:35 +0200
Subject: [PATCH] Balken

---
 extract_logs.py  | 26 ++++++++++++++++----------
 requirements.txt |  3 ++-
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/extract_logs.py b/extract_logs.py
index 2e5b06f..a61b7c2 100644
--- a/extract_logs.py
+++ b/extract_logs.py
@@ -5,6 +5,7 @@ import pandas as pd
 import concurrent.futures
 import pyarrow as pa
 import pyarrow.parquet as pq
+from tqdm import tqdm
 
 
 def clean_query(query_string):
@@ -73,18 +74,23 @@ def extract_urls_from_logs():
     total_urls = 0
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-        future_to_file = {
+        futures = {
             executor.submit(process_file, file): file for file in files_to_process
         }
-        for future in concurrent.futures.as_completed(future_to_file):
-            file = future_to_file[future]
-            try:
-                df = future.result()
-                total_urls += len(df)
-                table = pa.Table.from_pandas(df)
-                writer.write_table(table)
-            except Exception as exc:
-                print(f"{file} generated an exception: {exc}")
+
+        # Erstelle einen Fortschrittsbalken für die Gesamtanzahl der Dateien
+        with tqdm(total=len(files_to_process), desc="Gesamtfortschritt") as pbar:
+            for future in concurrent.futures.as_completed(futures):
+                file = futures[future]
+                try:
+                    df = future.result()
+                    total_urls += len(df)
+                    table = pa.Table.from_pandas(df)
+                    writer.write_table(table)
+                    pbar.update(1)
+                    pbar.set_postfix({"Datei": os.path.basename(file), "URLs": len(df)})
+                except Exception as exc:
+                    print(f"{file} generierte eine Ausnahme: {exc}")
 
     writer.close()
     return total_urls
diff --git a/requirements.txt b/requirements.txt
index c6c0750..728671b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 locust
 locust-plugins
 pandas
-pyarrow
\ No newline at end of file
+pyarrow
+tqdm
\ No newline at end of file
-- 
GitLab