first commit

2024-08-13 02:03:37 +08:00
commit 9284abbae2
16 changed files with 1357274 additions and 0 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,162 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
--- a/app/dockerfile
+++ b/app/dockerfile
@@ -0,0 +1,23 @@
+# Container image for the FastAPI prediction service started from main:app.
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the requirements file on its own first, so the dependency layer below
+# is rebuilt only when requirements.txt changes
+COPY requirements.txt .
+
+# Install any dependencies specified in requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the model file into the container
+COPY random_forest_model.pkl /app/
+
+# Copy the rest of the working directory contents into the container at /app
+COPY . .
+
+# Expose the port the app runs on
+EXPOSE 8000
+
+# Run the FastAPI application using Uvicorn.
+# --reload is deliberately not passed: it is a development aid that starts a
+# file-watching supervisor process, which serves no purpose for code baked
+# into an image. For local development override the command instead, e.g.
+#   docker run ... uvicorn main:app --host 0.0.0.0 --port 8000 --reload
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/app/main.py
+++ b/app/main.py
@@ -0,0 +1,66 @@
+from fastapi import FastAPI
+from pydantic import BaseModel
+import joblib
+import pandas as pd
+
+# Initialize the FastAPI app
+app = FastAPI()
+
+# Load the trained model once, at import time, so every request reuses it.
+# The path is relative: the process must be started from the directory that
+# contains random_forest_model.pkl.
+# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
+# code - only ship model files that come from a trusted source.
+model = joblib.load('random_forest_model.pkl')
+
+# Define the input data structure using Pydantic.
+# This is the request body of POST /predict: one integer per feature. The
+# field names are the same names predict() uses as DataFrame columns when it
+# calls the model.
+# NOTE(review): the meaning/encoding of each integer is not visible in this
+# file - presumably it matches the encoding used when the model was trained;
+# confirm against the training code.
+class InputData(BaseModel):
+    domain: int
+    subdomain: int
+    top_level_domain: int
+    query: int
+    fragment: int
+    redirect: int
+    path: int
+    redirect_chain: int
+    hsts_header: int
+    ssl_stripping: int
+    hostname_embedding: int
+    javascript_check: int
+    shortening_service: int
+    has_ip_address: int
+    tracking_descriptions: int
+    url_encoding: int
+    has_executable: int
+    tls: int
+    contents: int
+
+# Define a mapping from numerical predictions to class labels.
+# predict() looks the model output up here and falls back to "Unknown" for
+# any value that is not one of these keys.
+# NOTE(review): assumes the model emits integer labels 0-3 encoded in this
+# order - confirm against the training code.
+class_mapping = {
+    0: "Benign",
+    1: "Defacement",
+    2: "Malware",
+    3: "Phishing"
+}
+
+# Prediction endpoint: classifies one feature vector and returns its label.
+# The response body is the bare label string taken from class_mapping, or
+# "Unknown" when the model output is not a key of that mapping.
+@app.post("/predict")
+def predict(data: InputData):
+    # Columns, in the order they are handed to the model
+    feature_columns = [
+        'domain', 'subdomain', 'top_level_domain', 'query',
+        'fragment', 'redirect', 'path', 'redirect_chain',
+        'hsts_header', 'ssl_stripping', 'hostname_embedding',
+        'javascript_check', 'shortening_service', 'has_ip_address',
+        'tracking_descriptions', 'url_encoding', 'has_executable',
+        'tls', 'contents',
+    ]
+
+    # Single-row frame built from the request body
+    features = data.dict()
+    frame = pd.DataFrame([features], columns=feature_columns)
+
+    # The model returns one prediction per row; only one row was supplied
+    predicted_class = model.predict(frame)[0]
+
+    # Translate the raw prediction into its human-readable label
+    return class_mapping.get(predicted_class, "Unknown")
+
+# Running the FastAPI app
+# uvicorn main:app --reload (Use this command to run the FastAPI app)
--- a/app/random_forest_model.pkl
+++ b/app/random_forest_model.pkl
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -0,0 +1,5 @@
+fastapi==0.112.0
+uvicorn==0.30.5
+pandas==2.1.3
+scikit-learn==1.3.2
+joblib==1.4.2
--- a/dataset/.DS_Store
+++ b/dataset/.DS_Store
--- a/dataset/concatenated_split_files1.csv
+++ b/dataset/concatenated_split_files1.csv
--- a/dataset/failed_requests_2.csv
+++ b/dataset/failed_requests_2.csv
--- a/dataset/hasExecutable.csv
+++ b/dataset/hasExecutable.csv
--- a/dataset/ipadd.csv
+++ b/dataset/ipadd.csv
--- a/dataset/load_data.py
+++ b/dataset/load_data.py
@@ -0,0 +1,104 @@
+import csv
+import os
+import requests
+import concurrent.futures
+
+# Define the endpoint URL; every URL read from the CSV is POSTed to it
+endpoint_url = "http://localhost:8080/v1/qrcodetypes/scan"
+
+# Path to the CSV file (relative to the current working directory)
+csv_file_path = "hasExecutable.csv"
+
+# Directory to store the split CSV files; created here, at import time, if
+# it does not exist yet
+split_files_dir = "split_csv_files"
+os.makedirs(split_files_dir, exist_ok=True)
+
+# File to store failed requests (one row per URL with its status code)
+failed_requests_file = "failed_requests.csv"
+
+# Final concatenated CSV file, rebuilt from all split files at the end
+final_concatenated_file = "concatenated_split_files.csv"
+
+# Function to ensure URL starts with http:// or https://
+def ensure_url_prefix(url):
+    """Return url as-is if it starts with http:// or https://, else prepend https://.
+
+    The check is case-sensitive and looks only at the start of the string.
+    """
+    has_scheme = url.startswith(("http://", "https://"))
+    return url if has_scheme else "https://" + url
+
+# Read the CSV file and split it into num_splits files (199 by default)
+def split_csv_file(csv_file_path, split_files_dir, num_splits=199):
+    """Split a CSV file into num_splits files named split_file_<n>.csv.
+
+    Each output file gets a 'url,type' header. Every part receives
+    total_rows // num_splits rows, except the last one, which also takes the
+    remainder. URLs are normalised with ensure_url_prefix before writing.
+    """
+    with open(csv_file_path, newline='') as source:
+        rows = list(csv.DictReader(source))
+
+    chunk_size = len(rows) // num_splits
+    for part_number in range(1, num_splits + 1):
+        first = (part_number - 1) * chunk_size
+        # The final part runs to the end so no trailing rows are dropped
+        last = len(rows) if part_number == num_splits else part_number * chunk_size
+
+        part_path = os.path.join(split_files_dir, f"split_file_{part_number}.csv")
+        with open(part_path, 'w', newline='') as part_file:
+            part_writer = csv.DictWriter(part_file, fieldnames=['url', 'type'])
+            part_writer.writeheader()
+            for record in rows[first:last]:
+                record['url'] = ensure_url_prefix(record['url'])
+                part_writer.writerow(record)
+
+# Function to process a CSV file and send POST requests
+def process_csv_file(csv_file_path, timeout=60):
+    """POST every URL of a CSV file to endpoint_url and collect the failures.
+
+    Args:
+        csv_file_path: CSV file with a 'url' column.
+        timeout: Seconds to wait for the endpoint on each request before the
+            request is counted as failed.
+
+    Returns:
+        A list of {"url": ..., "status_code": ...} dicts, one per request
+        that did not come back with HTTP 200. status_code is None when no
+        HTTP response was received at all (connection error, timeout, ...).
+    """
+    failed_requests = []
+    with open(csv_file_path, newline='') as csvfile:
+        for row in csv.DictReader(csvfile):
+            url = row['url']  # Column header for URL is 'url'
+            try:
+                # Without a timeout a stalled endpoint would block this
+                # worker thread forever.
+                response = requests.post(endpoint_url, json={"data": url}, timeout=timeout)
+            except requests.RequestException as exc:
+                # Record the failure and keep going: an uncaught error here
+                # would surface through future.result() and abort the whole
+                # run before any failed request is written out.
+                print(f"Failed to send data: {url}, Error: {exc}")
+                failed_requests.append({"url": url, "status_code": None})
+                continue
+            if response.status_code == 200:
+                print(f"Successfully sent data: {url}")
+            else:
+                print(f"Failed to send data: {url}, Status code: {response.status_code}")
+                failed_requests.append({"url": url, "status_code": response.status_code})
+    return failed_requests
+
+# Function to write failed requests to a CSV file
+def write_failed_requests(failed_requests):
+    """Write the failed requests to failed_requests_file as 'url,status_code' rows.
+
+    Nothing is written (and no file is created) when the list is empty.
+    """
+    if failed_requests:
+        with open(failed_requests_file, 'w', newline='') as report:
+            report_writer = csv.DictWriter(report, fieldnames=['url', 'status_code'])
+            report_writer.writeheader()
+            report_writer.writerows(failed_requests)
+
+# Function to concatenate all split CSV files into one
+def concatenate_csv_files(split_files_dir, output_file):
+    """Merge every *.csv file found in split_files_dir into output_file.
+
+    The output gets a single 'url,type' header followed by the rows of each
+    input file, in the order os.listdir returns the files.
+    """
+    with open(output_file, 'w', newline='') as merged:
+        merged_writer = csv.DictWriter(merged, fieldnames=['url', 'type'])
+        merged_writer.writeheader()
+        csv_names = [name for name in os.listdir(split_files_dir) if name.endswith('.csv')]
+        for name in csv_names:
+            with open(os.path.join(split_files_dir, name), newline='') as part:
+                merged_writer.writerows(csv.DictReader(part))
+
+# Step 1: split the original CSV file into 199 parts
+split_csv_file(csv_file_path, split_files_dir)
+
+# Step 2: collect the path of every CSV file in the split directory
+split_files = [
+    os.path.join(split_files_dir, name)
+    for name in os.listdir(split_files_dir)
+    if name.endswith('.csv')
+]
+
+# Step 3: send the requests concurrently with 199 threads, one task per
+# split file, gathering the failures as each task finishes
+all_failed_requests = []
+with concurrent.futures.ThreadPoolExecutor(max_workers=199) as executor:
+    pending = [executor.submit(process_csv_file, path) for path in split_files]
+    for finished in concurrent.futures.as_completed(pending):
+        all_failed_requests.extend(finished.result())
+
+# Step 4: write all failed requests to a file
+write_failed_requests(all_failed_requests)
+
+# Step 5: concatenate all split CSV files into one final file
+concatenate_csv_files(split_files_dir, final_concatenated_file)
+
+print("Processing completed.")
--- a/dataset/malicious_phish.csv
+++ b/dataset/malicious_phish.csv
--- a/dataset/map_type.py
+++ b/dataset/map_type.py
@@ -0,0 +1,40 @@
+import pandas as pd
+
+# Load the CSV files (paths are relative to the current working directory).
+# file1 must provide a 'url' column; file2 must provide 'id', 'contents',
+# 'result_category' and 'qr_code_id' - these are the columns used below.
+file1 = pd.read_csv('concatenated_split_files1.csv')  
+file2 = pd.read_csv('_select_from_safeqr_url_url_left_join_safeqr_qr_code_qr_on_qr_id_202408101634.csv') 
+
+# Function to strip a leading 'http://' or 'https://' from a URL
+def strip_protocol(url):
+    """Return url without its leading http:// or https:// scheme.
+
+    Only a scheme at the very start is removed. A scheme that appears later
+    in the string (for example inside a query parameter such as
+    '?next=http://other.site') is kept, so that different URLs do not
+    collapse into the same lookup key. Non-string values (e.g. NaN for
+    missing cells) are returned unchanged.
+    """
+    if not isinstance(url, str):
+        return url
+    for scheme in ('https://', 'http://'):
+        if url.startswith(scheme):
+            return url[len(scheme):]
+    return url
+
+# Add scheme-less versions of the URL columns so rows of both files can be
+# matched regardless of http:// vs https://
+file1['url_stripped'] = file1['url'].apply(strip_protocol)
+file2['contents_stripped'] = file2['contents'].apply(strip_protocol)
+
+# Lookup table built from file2: stripped URL -> (result_category, qr_code_id).
+# When a stripped URL occurs several times, the last row of file2 wins.
+url_type_qr_dict = dict(
+    zip(file2['contents_stripped'], zip(file2['result_category'], file2['qr_code_id']))
+)
+
+# Returned for URLs that are missing from the lookup table
+_NO_MATCH = (None, None)
+
+
+def _category_for(stripped_url):
+    # result_category recorded for this URL, or None when it is unknown
+    return url_type_qr_dict.get(stripped_url, _NO_MATCH)[0]
+
+
+def _qr_code_id_for(stripped_url):
+    # qr_code_id recorded for this URL, or None when it is unknown
+    return url_type_qr_dict.get(stripped_url, _NO_MATCH)[1]
+
+
+# Work on a copy of file2: rewrite result_category through the lookup table,
+# then drop the id column and the helper column
+file2_copy = file2.copy()
+file2_copy['result_category'] = file2_copy['contents_stripped'].map(_category_for)
+file2_copy = file2_copy.drop(columns=['id', 'contents_stripped'])
+
+# Work on a copy of file1: attach the qr_code_id of the matching file2 row,
+# then drop the helper column
+file1_copy = file1.copy()
+file1_copy['qr_code_id'] = file1_copy['url_stripped'].map(_qr_code_id_for)
+file1_copy = file1_copy.drop(columns=['url_stripped'])
+
+# Save the updated copies to new CSV files
+file1_copy.to_csv('file1_updated.csv', index=False)
+file2_copy.to_csv('db_updated.csv', index=False)
--- a/dataset/ssl_error.csv
+++ b/dataset/ssl_error.csv
--- a/dataset/url_db_cleaned.csv.zip
+++ b/dataset/url_db_cleaned.csv.zip