From 35cf20076eb2e8041b646e4617e6bde1bf391a5c Mon Sep 17 00:00:00 2001
From: Thomas Baer <thomas.baer@slub-dresden.de>
Date: Wed, 18 Sep 2024 10:31:01 +0200
Subject: [PATCH] =?UTF-8?q?=C3=84ndere=20zu=20Parquet-File=20und=20Multith?=
 =?UTF-8?q?readed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore       |  3 ++-
 extract_logs.py  | 70 +++++++++++++++++++++++++-----------------------
 locustfile.py    | 16 +++++------
 requirements.txt |  2 ++
 4 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/.gitignore b/.gitignore
index b80df18..218e544 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 __pycache__
 venv/
-solr_queries.db
\ No newline at end of file
+solr_queries.db
+solr_queries.parquet
diff --git a/extract_logs.py b/extract_logs.py
index d442712..96d2bf8 100644
--- a/extract_logs.py
+++ b/extract_logs.py
@@ -1,57 +1,61 @@
 import os
 import re
-import sqlite3
 from urllib.parse import parse_qs, urlencode
+import pandas as pd
+import concurrent.futures
+import threading
 
 
 def clean_query(query_string):
-    # Parse die Query-Parameter
     params = parse_qs(query_string)
-
-    # Entferne unerwünschte Parameter
     for param in ["shard", "shard.url", "isShard", "NOW", "wt"]:
         params.pop(param, None)
-
-    # Konvertiere zurück zu einem Query-String
     return urlencode(params, doseq=True)
 
 
-def extract_urls_from_logs():
-    log_dir = "logs/"
-    log_pattern = re.compile(r"solr\.log\.\d+")
+def process_file(filename):
     urls = []
+    with open(filename, "r") as file:
+        for line in file:
+            if "o.a.s.c.S.Request" in line and "params=" in line:
+                params_match = re.search(r"params=\{(.+?)\}", line)
+                if params_match:
+                    query_string = params_match.group(1)
+                    clean_query_string = clean_query(query_string)
+                    urls.append(f"/select?{clean_query_string}")
+    return urls
 
-    for filename in os.listdir(log_dir):
-        if log_pattern.match(filename):
-            with open(os.path.join(log_dir, filename), "r") as file:
-                for line in file:
-                    if "o.a.s.c.S.Request" in line and "params=" in line:
-                        params_match = re.search(r"params=\{(.+?)\}", line)
-                        if params_match:
-                            query_string = params_match.group(1)
-                            clean_query_string = clean_query(query_string)
-                            urls.append(f"/select?{clean_query_string}")
-    return urls
+
+def extract_urls_from_logs():
+    log_dir = "logs/"
+    log_pattern = re.compile(r"solr\.log\.\d+")
+    all_urls = []
+
+    files_to_process = [
+        os.path.join(log_dir, f) for f in os.listdir(log_dir) if log_pattern.match(f)
+    ]
 
-
-def save_to_sqlite(urls):
-    conn = sqlite3.connect("solr_queries.db")
-    c = conn.cursor()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
+        future_to_file = {
+            executor.submit(process_file, file): file for file in files_to_process
+        }
+        for future in concurrent.futures.as_completed(future_to_file):
+            file = future_to_file[future]
+            try:
+                urls = future.result()
+                all_urls.extend(urls)
+            except Exception as exc:
+                print(f"{file} generated an exception: {exc}")
 
-    # Erstelle die Tabelle, falls sie nicht existiert
-    c.execute("""CREATE TABLE IF NOT EXISTS queries
-                 (id INTEGER PRIMARY KEY, url TEXT)""")
+    return all_urls
 
-    # Füge die URLs in die Datenbank ein
-    for url in urls:
-        c.execute("INSERT INTO queries (url) VALUES (?)", (url,))
 
-    conn.commit()
-    conn.close()
+def save_to_parquet(urls):
+    df = pd.DataFrame(urls, columns=["url"])
+    df.to_parquet("solr_queries.parquet", engine="pyarrow")
 
 
 if __name__ == "__main__":
     urls = extract_urls_from_logs()
-    save_to_sqlite(urls)
-    print(f"{len(urls)} URLs wurden extrahiert und in die Datenbank gespeichert.")
+    save_to_parquet(urls)
+    print(f"{len(urls)} URLs wurden extrahiert und in das Parquet-File gespeichert.")
diff --git a/locustfile.py b/locustfile.py
index c916ade..0e72ce2 100644
--- a/locustfile.py
+++ b/locustfile.py
@@ -1,4 +1,4 @@
-import sqlite3
+import pandas as pd
 from locust import HttpUser, task, between
 import random
 
@@ -7,15 +7,11 @@ class SolrUser(HttpUser):
     wait_time = between(1, 5)
 
     def on_start(self):
-        self.queries = self.load_queries_from_db()
+        self.queries = self.load_queries_from_parquet()
 
-    def load_queries_from_db(self):
-        conn = sqlite3.connect("solr_queries.db")
-        c = conn.cursor()
-        c.execute("SELECT url FROM queries")
-        queries = [row[0] for row in c.fetchall()]
-        conn.close()
-        return queries
+    def load_queries_from_parquet(self):
+        df = pd.read_parquet("solr_queries.parquet")
+        return df["url"].tolist()
 
     @task
     def search_solr(self):
@@ -23,4 +19,4 @@ class SolrUser(HttpUser):
             query = random.choice(self.queries)
             self.client.get(query)
         else:
-            print("Keine Abfragen in der Datenbank gefunden.")
+            print("Keine Abfragen im Parquet-File gefunden.")
diff --git a/requirements.txt b/requirements.txt
index 2161115..c6c0750 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
 locust
 locust-plugins
+pandas
+pyarrow
\ No newline at end of file
-- 
GitLab