split into 2 diff files and use concurrent thread

This commit is contained in:
heyethereum
2024-08-05 08:37:28 +08:00
parent 5b88a4ca7c
commit 4feabce446
21 changed files with 651265 additions and 11 deletions

View File

@@ -1,5 +1,7 @@
import csv import csv
import os
import requests import requests
import concurrent.futures
# Define the endpoint URL # Define the endpoint URL
endpoint_url = "http://localhost:8080/v1/qrcodetypes/scan" endpoint_url = "http://localhost:8080/v1/qrcodetypes/scan"
@@ -7,21 +9,55 @@ endpoint_url = "http://localhost:8080/v1/qrcodetypes/scan"
# Path to the CSV file # Path to the CSV file
csv_file_path = "malicious_phish.csv" csv_file_path = "malicious_phish.csv"
# Directory to store the split CSV files
split_files_dir = "split_csv_files"
os.makedirs(split_files_dir, exist_ok=True)
# Function to ensure URL starts with http:// or https:// # Function to ensure URL starts with http:// or https://
def ensure_url_prefix(url): def ensure_url_prefix(url):
if not (url.startswith("http://") or url.startswith("https://")): if not (url.startswith("http://") or url.startswith("https://")):
return "https://" + url return "https://" + url
return url return url
# Read the CSV file, prefix URLs, and send POST requests # Read the CSV file and split into 20 files
with open(csv_file_path, newline='') as csvfile: def split_csv_file(csv_file_path, split_files_dir, num_splits=20):
reader = csv.DictReader(csvfile) with open(csv_file_path, newline='') as csvfile:
for row in reader: reader = list(csv.DictReader(csvfile))
url = row['url'] total_rows = len(reader)
url_with_prefix = ensure_url_prefix(url) rows_per_file = total_rows // num_splits
response = requests.post(endpoint_url, json={"data": url_with_prefix})
if response.status_code == 200: for i in range(num_splits):
print(f"Successfully sent data: {url_with_prefix}") split_file_path = os.path.join(split_files_dir, f"split_file_{i+1}.csv")
else: with open(split_file_path, 'w', newline='') as split_file:
print(f"Failed to send data: {url_with_prefix}, Status code: {response.status_code}") writer = csv.DictWriter(split_file, fieldnames=['url', 'type'])
writer.writeheader()
start_index = i * rows_per_file
end_index = (i + 1) * rows_per_file if i != num_splits - 1 else total_rows
for row in reader[start_index:end_index]:
row['url'] = ensure_url_prefix(row['url'])
writer.writerow(row)
# Function to process a CSV file and send POST requests
def process_csv_file(csv_file_path):
with open(csv_file_path, newline='') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
url = row['url'] # Column header for URL is 'url'
response = requests.post(endpoint_url, json={"data": url})
if response.status_code == 200:
print(f"Successfully sent data: {url}")
else:
print(f"Failed to send data: {url}, Status code: {response.status_code}")
# Split the original CSV file into 20 parts
split_csv_file(csv_file_path, split_files_dir)
# Get the list of split CSV files
split_files = [os.path.join(split_files_dir, file) for file in os.listdir(split_files_dir) if file.endswith('.csv')]
# Execute the requests concurrently
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
futures = [executor.submit(process_csv_file, split_file) for split_file in split_files]
concurrent.futures.wait(futures)
print("Processing completed.")

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff