first commit
This commit is contained in:
162
.gitignore
vendored
Normal file
162
.gitignore
vendored
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||||
|
.pdm.toml
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build/
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
23
app/dockerfile
Normal file
23
app/dockerfile
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file first so the dependency layer is cached
# independently of application code changes
COPY requirements.txt .

# Install any dependencies specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the model file into the container
COPY random_forest_model.pkl /app/

# Copy the rest of the working directory contents into the container at /app
COPY . .

# Expose the port the app runs on
EXPOSE 8000

# Run the FastAPI application using Uvicorn.
# NOTE: --reload is intentionally not passed here: it starts a file watcher
# meant for local development and should not run inside the built image.
# For local development, override the command instead, e.g.
#   docker run ... uvicorn main:app --host 0.0.0.0 --port 8000 --reload
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
66
app/main.py
Normal file
66
app/main.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
from fastapi import FastAPI
|
||||||
|
from pydantic import BaseModel
|
||||||
|
import joblib
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Initialize the FastAPI app
|
||||||
|
app = FastAPI()
|
||||||
|
|
||||||
|
# Load the trained model
|
||||||
|
model = joblib.load('random_forest_model.pkl')
|
||||||
|
|
||||||
|
# Define the input data structure using Pydantic
|
||||||
|
class InputData(BaseModel):
    """Request body accepted by the ``/predict`` endpoint.

    Every feature is supplied as an integer. The fields are declared in the
    same order as the column list used to build the model input frame.
    """

    # URL structure features
    domain: int
    subdomain: int
    top_level_domain: int
    query: int
    fragment: int
    redirect: int
    path: int
    redirect_chain: int

    # Transport / security related features
    hsts_header: int
    ssl_stripping: int
    hostname_embedding: int
    javascript_check: int
    shortening_service: int
    has_ip_address: int
    tracking_descriptions: int
    url_encoding: int
    has_executable: int
    tls: int
    contents: int
|
||||||
|
|
||||||
|
# Define a mapping from numerical predictions to class labels
|
||||||
|
# Class labels listed in the order of the numeric predictions they stand for;
# the position of each label is its numeric class id.
_CLASS_LABELS = ("Benign", "Defacement", "Malware", "Phishing")
class_mapping = dict(enumerate(_CLASS_LABELS))
|
||||||
|
|
||||||
|
# Define a prediction endpoint
|
||||||
|
@app.post("/predict")
def predict(data: InputData):
    """Classify one URL feature vector and return its class label.

    Args:
        data: Validated request body carrying one integer per feature.

    Returns:
        The label from ``class_mapping`` for the model's prediction, or
        ``"Unknown"`` when the prediction has no entry in the mapping.
    """
    # pydantic v2 renamed ``.dict()`` to ``.model_dump()`` and deprecated the
    # old name; fall back to ``.dict()`` so pydantic v1 installs keep working.
    if hasattr(data, "model_dump"):
        input_data = data.model_dump()
    else:
        input_data = data.dict()

    # Wrap the single record in a list and pin the column order explicitly so
    # the frame layout does not depend on dict ordering.
    input_df = pd.DataFrame([input_data], columns=[
        'domain', 'subdomain', 'top_level_domain', 'query',
        'fragment', 'redirect', 'path', 'redirect_chain',
        'hsts_header', 'ssl_stripping', 'hostname_embedding',
        'javascript_check', 'shortening_service', 'has_ip_address',
        'tracking_descriptions', 'url_encoding', 'has_executable',
        'tls', 'contents',
    ])

    # The model returns one prediction per row; only one row was submitted.
    prediction = model.predict(input_df)[0]

    # Map the prediction to the class label
    prediction_label = class_mapping.get(prediction, "Unknown")

    # Return the class label as the prediction
    return prediction_label
|
||||||
|
|
||||||
|
# Running the FastAPI app
|
||||||
|
# uvicorn main:app --reload (Use this command to run the FastAPI app)
|
||||||
BIN
app/random_forest_model.pkl
Normal file
BIN
app/random_forest_model.pkl
Normal file
Binary file not shown.
5
app/requirements.txt
Normal file
5
app/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
fastapi==0.112.0
|
||||||
|
uvicorn==0.30.5
|
||||||
|
pandas==2.1.3
|
||||||
|
scikit-learn==1.3.2
|
||||||
|
joblib==1.4.2
|
||||||
BIN
dataset/.DS_Store
vendored
Normal file
BIN
dataset/.DS_Store
vendored
Normal file
Binary file not shown.
651207
dataset/concatenated_split_files1.csv
Normal file
651207
dataset/concatenated_split_files1.csv
Normal file
File diff suppressed because it is too large
Load Diff
4775
dataset/failed_requests_2.csv
Normal file
4775
dataset/failed_requests_2.csv
Normal file
File diff suppressed because it is too large
Load Diff
6083
dataset/hasExecutable.csv
Normal file
6083
dataset/hasExecutable.csv
Normal file
File diff suppressed because it is too large
Load Diff
12472
dataset/ipadd.csv
Normal file
12472
dataset/ipadd.csv
Normal file
File diff suppressed because it is too large
Load Diff
104
dataset/load_data.py
Normal file
104
dataset/load_data.py
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
import concurrent.futures
|
||||||
|
|
||||||
|
# Define the endpoint URL
|
||||||
|
endpoint_url = "http://localhost:8080/v1/qrcodetypes/scan"
|
||||||
|
|
||||||
|
# Path to the CSV file
|
||||||
|
csv_file_path = "hasExecutable.csv"
|
||||||
|
|
||||||
|
# Directory to store the split CSV files
|
||||||
|
split_files_dir = "split_csv_files"
|
||||||
|
os.makedirs(split_files_dir, exist_ok=True)
|
||||||
|
|
||||||
|
# File to store failed requests
|
||||||
|
failed_requests_file = "failed_requests.csv"
|
||||||
|
|
||||||
|
# Final concatenated CSV file
|
||||||
|
final_concatenated_file = "concatenated_split_files.csv"
|
||||||
|
|
||||||
|
# Function to ensure URL starts with http:// or https://
|
||||||
|
def ensure_url_prefix(url):
    """Return *url* with an explicit scheme, defaulting to ``https://``.

    Surrounding whitespace is removed first. A URL that already starts with
    ``http://`` or ``https://`` (in any letter case) is returned without a
    second scheme being prepended; anything else gets ``https://`` prefixed.

    Args:
        url: The URL string to normalise.

    Returns:
        The stripped URL, guaranteed to start with an http(s) scheme.
    """
    url = url.strip()
    # Compare case-insensitively so "HTTP://host" is not turned into
    # "https://HTTP://host".
    if url.lower().startswith(("http://", "https://")):
        return url
    return "https://" + url
|
||||||
|
|
||||||
|
# Read the CSV file and split into 199 files
|
||||||
|
def split_csv_file(csv_file_path, split_files_dir, num_splits=199):
    """Split a CSV of URLs into ``num_splits`` files of near-equal size.

    The input is read fully into memory, every ``url`` value is normalised
    with ``ensure_url_prefix``, and the rows are written in their original
    order to ``split_file_1.csv`` .. ``split_file_<num_splits>.csv`` inside
    ``split_files_dir``. Each output file has a ``url,type`` header.

    Rows are spread as evenly as possible: the first ``len(rows) % num_splits``
    files receive one extra row. When there are fewer rows than splits the
    trailing files contain only the header.

    Args:
        csv_file_path: Path of the CSV file to split; it must have a ``url``
            column.
        split_files_dir: Existing directory that receives the split files.
        num_splits: Number of output files to create (must be >= 1).

    Raises:
        ValueError: If ``num_splits`` is smaller than 1.
    """
    if num_splits < 1:
        raise ValueError("num_splits must be a positive integer")

    with open(csv_file_path, newline='') as csvfile:
        rows = list(csv.DictReader(csvfile))

    # Spread the remainder over the first files instead of dumping it all on
    # the last one, so the per-file workload stays balanced.
    base_size, remainder = divmod(len(rows), num_splits)

    start_index = 0
    for i in range(num_splits):
        end_index = start_index + base_size + (1 if i < remainder else 0)
        split_file_path = os.path.join(split_files_dir, f"split_file_{i+1}.csv")
        with open(split_file_path, 'w', newline='') as split_file:
            writer = csv.DictWriter(split_file, fieldnames=['url', 'type'])
            writer.writeheader()
            for row in rows[start_index:end_index]:
                row['url'] = ensure_url_prefix(row['url'])
                writer.writerow(row)
        start_index = end_index
|
||||||
|
|
||||||
|
# Function to process a CSV file and send POST requests
|
||||||
|
def process_csv_file(csv_file_path, timeout=30):
    """POST every URL in a CSV file to the scan endpoint.

    Each row's ``url`` value is sent as ``{"data": url}`` to ``endpoint_url``.
    A response other than HTTP 200, or a transport-level error (connection
    failure, timeout, ...), is recorded as a failure instead of aborting the
    remaining rows.

    Args:
        csv_file_path: Path of a CSV file with a ``url`` column.
        timeout: Seconds to wait for each request before giving up, so a
            stalled server cannot block the worker forever.

    Returns:
        A list of ``{"url": ..., "status_code": ...}`` dicts, one per failed
        request. ``status_code`` is the HTTP status for rejected requests, or
        the exception class name when no response was received.
    """
    failed_requests = []
    with open(csv_file_path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            url = row['url']  # Column header for URL is 'url'
            try:
                response = requests.post(endpoint_url, json={"data": url}, timeout=timeout)
            except requests.RequestException as exc:
                # No response at all: keep going and report the error type in
                # place of a status code.
                print(f"Failed to send data: {url}, Error: {exc}")
                failed_requests.append({"url": url, "status_code": type(exc).__name__})
                continue

            if response.status_code == 200:
                print(f"Successfully sent data: {url}")
            else:
                print(f"Failed to send data: {url}, Status code: {response.status_code}")
                failed_requests.append({"url": url, "status_code": response.status_code})
    return failed_requests
|
||||||
|
|
||||||
|
# Function to write failed requests to a CSV file
|
||||||
|
def write_failed_requests(failed_requests):
    """Write the collected failures to ``failed_requests_file``.

    Nothing is written (and no file is created) when ``failed_requests`` is
    empty. Otherwise the file is overwritten with a ``url,status_code``
    header followed by one line per failure.

    Args:
        failed_requests: Dicts with the keys ``url`` and ``status_code``.
    """
    if failed_requests:
        with open(failed_requests_file, 'w', newline='') as report:
            failure_writer = csv.DictWriter(report, fieldnames=['url', 'status_code'])
            failure_writer.writeheader()
            failure_writer.writerows(failed_requests)
|
||||||
|
|
||||||
|
# Function to concatenate all split CSV files into one
|
||||||
|
def concatenate_csv_files(split_files_dir, output_file):
    """Merge every ``.csv`` file in a directory into one ``url,type`` CSV.

    Input files are processed in natural filename order
    (``split_file_2.csv`` before ``split_file_10.csv``) so the result is
    reproducible; ``os.listdir`` alone returns entries in arbitrary order.
    If ``output_file`` itself lives inside ``split_files_dir`` it is skipped
    rather than being read while it is written.

    Args:
        split_files_dir: Directory containing the CSV files to merge.
        output_file: Path of the merged CSV file; it is overwritten.
    """
    import re  # local import: only needed for the natural sort key

    def natural_key(name):
        # re.split with a capture group alternates text and digit runs, so
        # odd positions are always digit runs and compare as integers.
        return [
            int(token) if index % 2 else token
            for index, token in enumerate(re.split(r'(\d+)', name))
        ]

    output_path = os.path.abspath(output_file)
    # Resolve the inputs before the output file is created.
    input_paths = [
        os.path.join(split_files_dir, name)
        for name in sorted(os.listdir(split_files_dir), key=natural_key)
        if name.endswith('.csv')
    ]
    input_paths = [path for path in input_paths if os.path.abspath(path) != output_path]

    fieldnames = ['url', 'type']
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        for path in input_paths:
            with open(path, newline='') as infile:
                writer.writerows(csv.DictReader(infile))
|
||||||
|
|
||||||
|
# Split the original CSV file into 199 parts
|
||||||
|
split_csv_file(csv_file_path, split_files_dir)

# Get the list of split CSV files (sorted so runs are reproducible)
split_files = sorted(
    os.path.join(split_files_dir, file)
    for file in os.listdir(split_files_dir)
    if file.endswith('.csv')
)

# Execute the requests concurrently with 199 threads
all_failed_requests = []
with concurrent.futures.ThreadPoolExecutor(max_workers=199) as executor:
    # Remember which file each future belongs to so a crash can be reported.
    futures = {
        executor.submit(process_csv_file, split_file): split_file
        for split_file in split_files
    }
    for future in concurrent.futures.as_completed(futures):
        try:
            all_failed_requests.extend(future.result())
        except Exception as exc:
            # Top-level boundary: one crashed worker must not discard the
            # failures already collected or skip the steps below.
            print(f"Worker for {futures[future]} aborted: {exc}")

# Write all failed requests to a file
write_failed_requests(all_failed_requests)

# Concatenate all split CSV files into one final file
concatenate_csv_files(split_files_dir, final_concatenated_file)

print("Processing completed.")
|
||||||
651199
dataset/malicious_phish.csv
Normal file
651199
dataset/malicious_phish.csv
Normal file
File diff suppressed because it is too large
Load Diff
40
dataset/map_type.py
Normal file
40
dataset/map_type.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Load the CSV files
|
||||||
|
file1 = pd.read_csv('concatenated_split_files1.csv')
|
||||||
|
file2 = pd.read_csv('_select_from_safeqr_url_url_left_join_safeqr_qr_code_qr_on_qr_id_202408101634.csv')
|
||||||
|
|
||||||
|
# Function to strip 'http://' or 'https://' from a URL
|
||||||
|
def strip_protocol(url):
    """Remove a leading ``http://`` or ``https://`` from *url*.

    Only the scheme at the very start of the string is removed (matched
    case-insensitively). Occurrences further inside the URL, for example in a
    redirect query parameter, are left untouched so distinct URLs do not
    collapse into the same lookup key.

    Args:
        url: The value to clean. Non-string values (e.g. NaN for missing
            cells) are returned unchanged.

    Returns:
        The URL without its leading scheme, or the input itself when it is
        not a string or has no http(s) scheme.
    """
    if not isinstance(url, str):
        return url
    lowered = url.lower()
    for scheme in ('https://', 'http://'):
        if lowered.startswith(scheme):
            return url[len(scheme):]
    return url
|
||||||
|
|
||||||
|
# Apply the strip function to both file1 and file2 URLs
|
||||||
|
# Add scheme-less versions of the URLs to both frames to use as join keys
file1['url_stripped'] = file1['url'].apply(strip_protocol)
file2['contents_stripped'] = file2['contents'].apply(strip_protocol)

# Lookup table: stripped URL -> (result_category, qr_code_id) taken from file2.
# When a stripped URL occurs more than once, the last row wins.
url_type_qr_dict = {
    stripped: (category, qr_id)
    for stripped, category, qr_id in zip(
        file2['contents_stripped'], file2['result_category'], file2['qr_code_id']
    )
}

# Placeholder returned for URLs that are not in the lookup table
_NO_MATCH = (None, None)


def _lookup_field(position):
    """Build a mapper returning one element of the lookup tuple, or None."""
    def pick(stripped_url):
        return url_type_qr_dict.get(stripped_url, _NO_MATCH)[position]
    return pick


# Work on a copy of file2: refresh result_category from the lookup table, then
# drop the id column and the temporary join key.
# NOTE(review): the lookup table is built from file2 itself, so this only
# changes rows whose stripped URL is duplicated with differing categories.
# Confirm whether the category was meant to come from file1 instead.
file2_copy = (
    file2
    .assign(result_category=file2['contents_stripped'].map(_lookup_field(0)))
    .drop(columns=['id', 'contents_stripped'])
)

# Work on a copy of file1: attach the matching qr_code_id from file2 (None
# when the URL is unknown), then drop the temporary join key.
file1_copy = (
    file1
    .assign(qr_code_id=file1['url_stripped'].map(_lookup_field(1)))
    .drop(columns=['url_stripped'])
)

# Save the updated copies to new CSV files
file1_copy.to_csv('file1_updated.csv', index=False)
file2_copy.to_csv('db_updated.csv', index=False)
|
||||||
31138
dataset/ssl_error.csv
Normal file
31138
dataset/ssl_error.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
dataset/url_db_cleaned.csv.zip
Normal file
BIN
dataset/url_db_cleaned.csv.zip
Normal file
Binary file not shown.
Reference in New Issue
Block a user