rebranch
This commit is contained in:
651207
dataset/concatenated_split_files1.csv
Normal file
651207
dataset/concatenated_split_files1.csv
Normal file
File diff suppressed because it is too large
Load Diff
4775
dataset/failed_requests_2.csv
Normal file
4775
dataset/failed_requests_2.csv
Normal file
File diff suppressed because it is too large
Load Diff
6083
dataset/hasExecutable.csv
Normal file
6083
dataset/hasExecutable.csv
Normal file
File diff suppressed because it is too large
Load Diff
12472
dataset/ipadd.csv
Normal file
12472
dataset/ipadd.csv
Normal file
File diff suppressed because it is too large
Load Diff
104
dataset/load_data.py
Normal file
104
dataset/load_data.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import csv
|
||||
import os
|
||||
import requests
|
||||
import concurrent.futures
|
||||
|
||||
# --- Configuration ----------------------------------------------------------

# Scan endpoint that every URL is POSTed to.
endpoint_url = "http://localhost:8080/v1/qrcodetypes/scan"

# Input CSV that gets split into smaller files.
csv_file_path = "hasExecutable.csv"

# Folder that receives the split CSV files.
split_files_dir = "split_csv_files"

# Output CSV listing every request that did not come back with HTTP 200.
failed_requests_file = "failed_requests.csv"

# Output CSV produced by merging all split files back together.
final_concatenated_file = "concatenated_split_files.csv"

# Create the split folder up front; an already existing folder is fine.
os.makedirs(split_files_dir, exist_ok=True)
|
||||
|
||||
# Function to ensure URL starts with http:// or https://
|
||||
def ensure_url_prefix(url):
    """Return *url* with an explicit scheme, defaulting to ``https://``.

    The scheme check is case-insensitive, so an input such as
    ``HTTP://example.com`` is returned unchanged instead of being turned
    into ``https://HTTP://example.com``.

    Args:
        url: URL string, with or without a leading ``http://`` / ``https://``.

    Returns:
        ``url`` itself when it already starts with ``http://`` or
        ``https://`` (in any letter case); otherwise ``"https://" + url``.
    """
    # URI schemes are case-insensitive, so compare against a lower-cased copy
    # while returning the caller's original spelling.
    if url.lower().startswith(("http://", "https://")):
        return url
    return "https://" + url
|
||||
|
||||
# Read the CSV file and split into 199 files
|
||||
def split_csv_file(csv_file_path, split_files_dir, num_splits=199):
    """Split a CSV file into ``num_splits`` evenly sized CSV files.

    The input is read completely into memory, every row's ``url`` value is
    passed through ``ensure_url_prefix`` and the rows are written, in their
    original order, to ``split_file_1.csv`` ... ``split_file_<num_splits>.csv``
    inside ``split_files_dir``. Each output file starts with the header
    ``url,type``.

    Rows are distributed as evenly as possible: when the row count is not a
    multiple of ``num_splits`` the first files receive one extra row each,
    rather than the last file receiving the whole remainder. This keeps the
    per-file workload balanced even when there are fewer rows than splits.

    Args:
        csv_file_path: Path of the CSV to split; it must have a ``url``
            column.
        split_files_dir: Existing directory that receives the split files.
        num_splits: Number of files to produce.
    """
    with open(csv_file_path, newline='') as csvfile:
        rows = list(csv.DictReader(csvfile))

    # `extra` files get one additional row so the remainder is spread out
    # instead of being piled onto the final split.
    rows_per_file, extra = divmod(len(rows), num_splits)

    start_index = 0
    for i in range(num_splits):
        end_index = start_index + rows_per_file + (1 if i < extra else 0)
        split_file_path = os.path.join(split_files_dir, f"split_file_{i+1}.csv")
        with open(split_file_path, 'w', newline='') as split_file:
            writer = csv.DictWriter(split_file, fieldnames=['url', 'type'])
            writer.writeheader()
            for row in rows[start_index:end_index]:
                row['url'] = ensure_url_prefix(row['url'])
                writer.writerow(row)
        start_index = end_index
|
||||
|
||||
# Function to process a CSV file and send POST requests
|
||||
def process_csv_file(csv_file_path, timeout=60):
    """POST every URL of one CSV file to the scan endpoint.

    Each row's ``url`` value is sent as ``{"data": url}`` to ``endpoint_url``.
    A response with status 200 counts as success; anything else is recorded
    as a failure. Transport-level errors (connection refused, timeout, ...)
    are recorded as failures too instead of propagating, so one bad request
    no longer aborts the rest of the file or the whole run.

    Args:
        csv_file_path: Path of a CSV file with a ``url`` column.
        timeout: Seconds to wait for the endpoint before giving up on a
            single request. Without a timeout ``requests`` waits
            indefinitely.

    Returns:
        A list of ``{"url": ..., "status_code": ...}`` dicts, one per failed
        request. ``status_code`` is ``None`` when no HTTP response was
        received at all.
    """
    failed_requests = []
    with open(csv_file_path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            url = row['url']  # Column header for URL is 'url'
            try:
                response = requests.post(
                    endpoint_url, json={"data": url}, timeout=timeout
                )
            except requests.RequestException as exc:
                # No HTTP response exists, so there is no status code to log.
                print(f"Failed to send data: {url}, Error: {exc}")
                failed_requests.append({"url": url, "status_code": None})
                continue

            if response.status_code == 200:
                print(f"Successfully sent data: {url}")
            else:
                print(f"Failed to send data: {url}, Status code: {response.status_code}")
                failed_requests.append({"url": url, "status_code": response.status_code})
    return failed_requests
|
||||
|
||||
# Function to write failed requests to a CSV file
|
||||
def write_failed_requests(failed_requests):
    """Dump the collected failures to ``failed_requests_file``.

    Nothing is written (and no file is created or truncated) when
    ``failed_requests`` is empty.

    Args:
        failed_requests: Dicts with the keys ``url`` and ``status_code``.
    """
    if failed_requests:
        with open(failed_requests_file, 'w', newline='') as report:
            failure_writer = csv.DictWriter(report, fieldnames=['url', 'status_code'])
            failure_writer.writeheader()
            failure_writer.writerows(failed_requests)
|
||||
|
||||
# Function to concatenate all split CSV files into one
|
||||
def concatenate_csv_files(split_files_dir, output_file):
    """Merge every ``.csv`` file in ``split_files_dir`` into ``output_file``.

    The output starts with a single ``url,type`` header followed by the rows
    of each input file. Input files are processed in natural order
    (``split_file_2.csv`` before ``split_file_10.csv``) so the result is
    deterministic; ``os.listdir`` alone returns names in arbitrary order.
    If ``output_file`` itself lives inside ``split_files_dir`` it is skipped
    rather than being read while it is being written.

    Args:
        split_files_dir: Directory containing the CSV files to merge.
        output_file: Path of the merged CSV to create or overwrite.
    """
    import re

    def natural_key(file_name):
        # re.split with a capturing group alternates text / digit runs, so
        # odd positions are always the numeric parts.
        return [
            int(token) if position % 2 else token
            for position, token in enumerate(re.split(r'(\d+)', file_name))
        ]

    output_path = os.path.abspath(output_file)
    csv_names = sorted(
        (name for name in os.listdir(split_files_dir) if name.endswith('.csv')),
        key=natural_key,
    )

    fieldnames = ['url', 'type']
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        for name in csv_names:
            input_path = os.path.join(split_files_dir, name)
            if os.path.abspath(input_path) == output_path:
                continue
            with open(input_path, newline='') as infile:
                writer.writerows(csv.DictReader(infile))
|
||||
|
||||
# Step 1: break the input CSV into the split files (199 by default).
split_csv_file(csv_file_path, split_files_dir)

# Step 2: gather the path of every CSV that now sits in the split folder.
split_files = [
    os.path.join(split_files_dir, name)
    for name in os.listdir(split_files_dir)
    if name.endswith('.csv')
]

# Step 3: send the requests, one task per split file, on up to 199 threads,
# and pool the failures as the tasks finish.
all_failed_requests = []
with concurrent.futures.ThreadPoolExecutor(max_workers=199) as executor:
    futures = [executor.submit(process_csv_file, path) for path in split_files]
    for future in concurrent.futures.as_completed(futures):
        all_failed_requests += future.result()

# Step 4: persist the pooled failures.
write_failed_requests(all_failed_requests)

# Step 5: merge the split files into the final CSV.
concatenate_csv_files(split_files_dir, final_concatenated_file)

print("Processing completed.")
|
||||
651199
dataset/malicious_phish.csv
Normal file
651199
dataset/malicious_phish.csv
Normal file
File diff suppressed because it is too large
Load Diff
40
dataset/map_type.py
Normal file
40
dataset/map_type.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import pandas as pd
|
||||
|
||||
# Read the two inputs into DataFrames.
# file1: the concatenated URL CSV; file2: a second CSV whose name suggests a
# SQL export (presumably from the safeqr database; verify).
file1 = pd.read_csv('concatenated_split_files1.csv')
file2 = pd.read_csv(
    '_select_from_safeqr_url_url_left_join_safeqr_qr_code_qr_on_qr_id_202408101634.csv'
)
|
||||
|
||||
# Function to strip 'http://' or 'https://' from a URL
|
||||
def strip_protocol(url):
    """Remove a leading ``https://`` or ``http://`` from *url*.

    Only the prefix is removed; a protocol that appears later in the string
    (for example inside a query parameter) is left untouched, so distinct
    URLs are not collapsed into the same key.

    Args:
        url: Value to clean. Non-string values (such as NaN for empty cells)
            are passed through unchanged.

    Returns:
        The URL without its leading protocol, or the input itself when it is
        not a string or carries no such prefix.
    """
    if not isinstance(url, str):
        return url
    for prefix in ('https://', 'http://'):
        if url.startswith(prefix):
            return url[len(prefix):]
    return url
|
||||
|
||||
# Add protocol-free versions of the URL columns so both files can be matched
# regardless of an http:// or https:// prefix.
file1['url_stripped'] = file1['url'].apply(strip_protocol)
file2['contents_stripped'] = file2['contents'].apply(strip_protocol)

# Lookup table built from file2: stripped URL -> (result_category, qr_code_id).
# For duplicate URLs the last row wins.
url_type_qr_dict = {
    stripped: (category, qr_id)
    for stripped, category, qr_id in zip(
        file2['contents_stripped'], file2['result_category'], file2['qr_code_id']
    )
}

# Work on a copy so file2 itself stays untouched.
file2_copy = file2.copy()

# NOTE(review): the lookup table comes from file2 itself, so this re-derives
# result_category from file2's own rows; confirm that this is intended rather
# than filling it from file1.
file2_copy['result_category'] = file2_copy['contents_stripped'].map(
    lambda stripped: url_type_qr_dict.get(stripped, (None, None))[0]
)

# The id column and the helper column are not part of the output.
file2_copy = file2_copy.drop(columns=['id', 'contents_stripped'])

# Work on a copy so file1 itself stays untouched.
file1_copy = file1.copy()

# Attach the qr_code_id of the matching file2 row; None when there is no match.
file1_copy['qr_code_id'] = file1_copy['url_stripped'].map(
    lambda stripped: url_type_qr_dict.get(stripped, (None, None))[1]
)

# The helper column is not part of the output.
file1_copy = file1_copy.drop(columns=['url_stripped'])

# Write both results without the DataFrame index.
file1_copy.to_csv('file1_updated.csv', index=False)
file2_copy.to_csv('db_updated.csv', index=False)
|
||||
31138
dataset/ssl_error.csv
Normal file
31138
dataset/ssl_error.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
dataset/url_db_cleaned.csv.zip
Normal file
BIN
dataset/url_db_cleaned.csv.zip
Normal file
Binary file not shown.
Reference in New Issue
Block a user