diff --git a/.gitignore b/.gitignore
index b80df185eaf57e0836ddefa9bf212b489646aec5..218e544f91efda826625fc82a6e73d1a386ffe6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 __pycache__
 venv/
-solr_queries.db
\ No newline at end of file
+solr_queries.db
+solr_queries.parquet
diff --git a/extract_logs.py b/extract_logs.py
index d442712c698b7622aeec6d68a6361f7730bda42b..96d2bf892de78df34119466ffac49738ba9795fb 100644
--- a/extract_logs.py
+++ b/extract_logs.py
@@ -1,57 +1,61 @@
 import os
 import re
-import sqlite3
 from urllib.parse import parse_qs, urlencode
+import pandas as pd
+import concurrent.futures
+import threading
 
 
 def clean_query(query_string):
-    # Parse the query parameters
     params = parse_qs(query_string)
-
-    # Remove unwanted parameters
     for param in ["shard", "shard.url", "isShard", "NOW", "wt"]:
         params.pop(param, None)
-
-    # Convert back to a query string
     return urlencode(params, doseq=True)
 
 
-def extract_urls_from_logs():
-    log_dir = "logs/"
-    log_pattern = re.compile(r"solr\.log\.\d+")
+def process_file(filename):
     urls = []
+    with open(filename, "r") as file:
+        for line in file:
+            if "o.a.s.c.S.Request" in line and "params=" in line:
+                params_match = re.search(r"params=\{(.+?)\}", line)
+                if params_match:
+                    query_string = params_match.group(1)
+                    clean_query_string = clean_query(query_string)
+                    urls.append(f"/select?{clean_query_string}")
+    return urls
 
-    for filename in os.listdir(log_dir):
-        if log_pattern.match(filename):
-            with open(os.path.join(log_dir, filename), "r") as file:
-                for line in file:
-                    if "o.a.s.c.S.Request" in line and "params=" in line:
-                        params_match = re.search(r"params=\{(.+?)\}", line)
-                        if params_match:
-                            query_string = params_match.group(1)
-                            clean_query_string = clean_query(query_string)
-                            urls.append(f"/select?{clean_query_string}")
-    return urls
 
 
+def extract_urls_from_logs():
+    log_dir = "logs/"
+    log_pattern = re.compile(r"solr\.log\.\d+")
+    all_urls = []
+    files_to_process = [
+        os.path.join(log_dir, f) for f in os.listdir(log_dir) if log_pattern.match(f)
+    ]
 
-def save_to_sqlite(urls):
-    conn = sqlite3.connect("solr_queries.db")
-    c = conn.cursor()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
+        future_to_file = {
+            executor.submit(process_file, file): file for file in files_to_process
+        }
+        for future in concurrent.futures.as_completed(future_to_file):
+            file = future_to_file[future]
+            try:
+                urls = future.result()
+                all_urls.extend(urls)
+            except Exception as exc:
+                print(f"{file} generated an exception: {exc}")
 
-    # Create the table if it doesn't exist
-    c.execute("""CREATE TABLE IF NOT EXISTS queries
-                 (id INTEGER PRIMARY KEY, url TEXT)""")
+    return all_urls
 
-    # Insert the URLs into the database
-    for url in urls:
-        c.execute("INSERT INTO queries (url) VALUES (?)", (url,))
 
-    conn.commit()
-    conn.close()
+def save_to_parquet(urls):
+    df = pd.DataFrame(urls, columns=["url"])
+    df.to_parquet("solr_queries.parquet", engine="pyarrow")
 
 
 if __name__ == "__main__":
     urls = extract_urls_from_logs()
-    save_to_sqlite(urls)
-    print(f"{len(urls)} URLs were extracted and saved to the database.")
+    save_to_parquet(urls)
+    print(f"{len(urls)} URLs were extracted and saved to the Parquet file.")
diff --git a/locustfile.py b/locustfile.py
index c916adead017e1c99bba3137b8ece97807f76b0c..0e72ce25caf488d524d469a00c3ab0f62042fa69 100644
--- a/locustfile.py
+++ b/locustfile.py
@@ -1,4 +1,4 @@
-import sqlite3
+import pandas as pd
 from locust import HttpUser, task, between
 import random
 
@@ -7,15 +7,11 @@ class SolrUser(HttpUser):
     wait_time = between(1, 5)
 
     def on_start(self):
-        self.queries = self.load_queries_from_db()
+        self.queries = self.load_queries_from_parquet()
 
-    def load_queries_from_db(self):
-        conn = sqlite3.connect("solr_queries.db")
-        c = conn.cursor()
-        c.execute("SELECT url FROM queries")
-        queries = [row[0] for row in c.fetchall()]
-        conn.close()
-        return queries
+    def load_queries_from_parquet(self):
+        df = pd.read_parquet("solr_queries.parquet")
+        return df["url"].tolist()
 
     @task
     def search_solr(self):
@@ -23,4 +19,4 @@ class SolrUser(HttpUser):
             query = random.choice(self.queries)
             self.client.get(query)
         else:
-            print("No queries found in the database.")
+            print("No queries found in the Parquet file.")
diff --git a/requirements.txt b/requirements.txt
index 2161115c4434450594b01ca81b41b36e7aacaa7a..c6c07501b5d7577990624036c3e2558dec93df4f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
 locust
 locust-plugins
+pandas
+pyarrow
\ No newline at end of file
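
Quick sanity check of the new output (a minimal sketch, assuming extract_logs.py has already been run against logs/ and that the pandas and pyarrow packages from requirements.txt are installed):

    import pandas as pd

    # Load the Parquet file written by extract_logs.py and spot-check a few queries.
    df = pd.read_parquet("solr_queries.parquet")
    print(f"{len(df)} queries available for replay")
    print(df["url"].head().tolist())

Locust resolves the stored /select?... paths against whatever --host is passed on the command line, so point it at the Solr core, e.g. locust -f locustfile.py --host http://localhost:8983/solr/mycollection (the collection name here is only a placeholder).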