From 89ad769b4475cea40726cd87f0c7363fb6157eb6 Mon Sep 17 00:00:00 2001
From: Thomas Baer <thomas.baer@slub-dresden.de>
Date: Wed, 18 Sep 2024 11:08:35 +0200
Subject: [PATCH] Add tqdm progress bar to log extraction

---
 extract_logs.py  | 26 ++++++++++++++++----------
 requirements.txt |  3 ++-
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/extract_logs.py b/extract_logs.py
index 2e5b06f..a61b7c2 100644
--- a/extract_logs.py
+++ b/extract_logs.py
@@ -5,6 +5,7 @@ import pandas as pd
 import concurrent.futures
 import pyarrow as pa
 import pyarrow.parquet as pq
+from tqdm import tqdm
 
 
 def clean_query(query_string):
@@ -73,18 +74,23 @@ def extract_urls_from_logs():
 
     total_urls = 0
     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-        future_to_file = {
+        futures = {
             executor.submit(process_file, file): file for file in files_to_process
         }
-        for future in concurrent.futures.as_completed(future_to_file):
-            file = future_to_file[future]
-            try:
-                df = future.result()
-                total_urls += len(df)
-                table = pa.Table.from_pandas(df)
-                writer.write_table(table)
-            except Exception as exc:
-                print(f"{file} generated an exception: {exc}")
+
+        # Create a progress bar sized to the total number of files
+        with tqdm(total=len(files_to_process), desc="Gesamtfortschritt") as pbar:
+            for future in concurrent.futures.as_completed(futures):
+                file = futures[future]
+                try:
+                    df = future.result()
+                    total_urls += len(df)
+                    table = pa.Table.from_pandas(df)
+                    writer.write_table(table)
+                    pbar.update(1)
+                    pbar.set_postfix({"Datei": os.path.basename(file), "URLs": len(df)})
+                except Exception as exc:
+                    print(f"{file} generierte eine Ausnahme: {exc}")
 
     writer.close()
     return total_urls
diff --git a/requirements.txt b/requirements.txt
index c6c0750..728671b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 locust
 locust-plugins
 pandas
-pyarrow
\ No newline at end of file
+pyarrow
+tqdm
\ No newline at end of file
-- 
GitLab