first commit

2024-08-13 02:03:37 +08:00
commit 9284abbae2
16 changed files with 1357274 additions and 0 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,162 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
 #   in version control.
 #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
 .pdm.toml
 .pdm-python
 .pdm-build/
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
--- a/app/dockerfile
+++ b/app/dockerfile
@@ -0,0 +1,23 @@
 # Use an official Python runtime as a parent image
 FROM python:3.9-slim
 # Set the working directory in the container
 WORKDIR /app
 # Copy only the requirements file first so the dependency install below
 # is a separate layer from the application code
 COPY requirements.txt .
 # Install any dependencies specified in requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 # Copy the model file into the container
 COPY random_forest_model.pkl /app/
 # Copy the rest of the working directory contents into the container at /app
 COPY . .
 # Expose the port the app runs on
 EXPOSE 8000
 # Run the FastAPI application using Uvicorn.
 # --reload is intentionally not passed: it is a development aid that watches
 # the source tree for changes, which is wasted work inside an immutable image.
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/app/main.py
+++ b/app/main.py
@@ -0,0 +1,66 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
 import joblib
 import pandas as pd
 # Initialize the FastAPI app
 app = FastAPI()
 # Load the trained model once, at import time; the module-level `model` is
 # what predict() calls on every request.
 # The path is relative, so it is resolved against the process working directory.
 # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
 # code — only ship a model file that comes from a trusted source.
 model = joblib.load('random_forest_model.pkl')
 # Define the input data structure using Pydantic
 class InputData(BaseModel):
    # Request body for POST /predict: one integer value per model feature.
    # The 19 field names below are the same names, in the same order, that
    # predict() uses as the DataFrame columns handed to the model.
    # NOTE(review): the meaning of each integer (flag, count, category code)
    # is not defined in this file — confirm against the training pipeline.
    domain: int
    subdomain: int
    top_level_domain: int
    query: int
    fragment: int
    redirect: int
    path: int
    redirect_chain: int
    hsts_header: int
    ssl_stripping: int
    hostname_embedding: int
    javascript_check: int
    shortening_service: int
    has_ip_address: int
    tracking_descriptions: int
    url_encoding: int
    has_executable: int
    tls: int
    contents: int
 # Define a mapping from numerical predictions to class labels
 # Labels are listed in index order: the label at position i is the one
 # returned for numerical prediction i.
 class_mapping = dict(enumerate(("Benign", "Defacement", "Malware", "Phishing")))
 # Define a prediction endpoint
 @app.post("/predict")
 def predict(data: InputData):
    """Classify one URL feature vector and return its class label.

    The request body is turned into a single-row DataFrame whose columns are
    listed explicitly, so the column order handed to the model does not depend
    on how the request was serialised. The model's numerical prediction is
    translated through ``class_mapping``; a value with no entry there yields
    ``"Unknown"``.
    """
    # Pydantic v2 renamed .dict() to .model_dump() and deprecated the old
    # name; fall back to .dict() so the endpoint still works on Pydantic v1.
    input_data = data.model_dump() if hasattr(data, "model_dump") else data.dict()
    feature_columns = [
        'domain', 'subdomain', 'top_level_domain', 'query',
        'fragment', 'redirect', 'path', 'redirect_chain',
        'hsts_header', 'ssl_stripping', 'hostname_embedding',
        'javascript_check', 'shortening_service', 'has_ip_address',
        'tracking_descriptions', 'url_encoding', 'has_executable',
        'tls', 'contents',
    ]
    input_df = pd.DataFrame([input_data], columns=feature_columns)
    # predict() returns one value per input row; there is exactly one row here
    prediction = model.predict(input_df)[0]
    # Return the class label as the prediction
    return class_mapping.get(prediction, "Unknown")
 # Running the FastAPI app
 # uvicorn main:app --reload (Use this command to run the FastAPI app)
--- a/app/random_forest_model.pkl
+++ b/app/random_forest_model.pkl
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -0,0 +1,5 @@
 fastapi==0.112.0
 uvicorn==0.30.5
 pandas==2.1.3
 scikit-learn==1.3.2
 joblib==1.4.2
--- a/dataset/.DS_Store
+++ b/dataset/.DS_Store
--- a/dataset/concatenated_split_files1.csv
+++ b/dataset/concatenated_split_files1.csv
--- a/dataset/failed_requests_2.csv
+++ b/dataset/failed_requests_2.csv
--- a/dataset/hasExecutable.csv
+++ b/dataset/hasExecutable.csv
--- a/dataset/ipadd.csv
+++ b/dataset/ipadd.csv
--- a/dataset/load_data.py
+++ b/dataset/load_data.py
@@ -0,0 +1,104 @@
 import csv
 import os
 import requests
 import concurrent.futures
 # --- Service under test ---
 # Every URL is POSTed to this scan endpoint
 endpoint_url = "http://localhost:8080/v1/qrcodetypes/scan"
 # --- Input ---
 # Source CSV that gets split and replayed
 csv_file_path = "hasExecutable.csv"
 # --- Outputs ---
 # Directory that receives the split CSV files
 split_files_dir = "split_csv_files"
 # CSV listing the requests that did not succeed
 failed_requests_file = "failed_requests.csv"
 # Single CSV rebuilt from all split files at the end of the run
 final_concatenated_file = "concatenated_split_files.csv"
 # Make sure the split directory exists before anything is written to it
 os.makedirs(split_files_dir, exist_ok=True)
 # Function to ensure URL starts with http:// or https://
 def ensure_url_prefix(url):
    """Return *url* with an explicit scheme, defaulting to ``https://``.

    URL schemes are case-insensitive, so the check ignores case:
    ``HTTP://host`` is returned unchanged instead of being turned into
    ``https://HTTP://host``. A URL that already has a scheme is returned
    exactly as given.
    """
    if url.lower().startswith(("http://", "https://")):
        return url
    return "https://" + url
 # Read the CSV file and split into 199 files
 def split_csv_file(csv_file_path, split_files_dir, num_splits=199):
    """Split a URL CSV into ``num_splits`` files of near-equal size.

    All rows are read into memory, each ``url`` is passed through
    ``ensure_url_prefix``, and the rows are written in their original order to
    ``split_file_1.csv`` .. ``split_file_<num_splits>.csv`` inside
    ``split_files_dir``, each file starting with a ``url,type`` header.

    The remainder of ``total_rows / num_splits`` is spread one row at a time
    over the first files, so file sizes differ by at most one row. Giving the
    whole remainder to the last file would leave every other file empty
    whenever there are fewer rows than splits.

    Args:
        csv_file_path: Source CSV; must have a ``url`` column.
        split_files_dir: Existing directory that receives the split files.
        num_splits: Number of files to write.
    """
    with open(csv_file_path, newline='') as csvfile:
        rows = list(csv.DictReader(csvfile))
    rows_per_file, leftover = divmod(len(rows), num_splits)
    start_index = 0
    for i in range(num_splits):
        # The first `leftover` files each take one extra row
        end_index = start_index + rows_per_file + (1 if i < leftover else 0)
        split_file_path = os.path.join(split_files_dir, f"split_file_{i+1}.csv")
        with open(split_file_path, 'w', newline='') as split_file:
            writer = csv.DictWriter(split_file, fieldnames=['url', 'type'])
            writer.writeheader()
            for row in rows[start_index:end_index]:
                row['url'] = ensure_url_prefix(row['url'])
                writer.writerow(row)
        start_index = end_index
 # Function to process a CSV file and send POST requests
 def process_csv_file(csv_file_path, timeout=30):
    """POST every URL in a CSV file to ``endpoint_url`` and report failures.

    Args:
        csv_file_path: CSV file with a ``url`` column.
        timeout: Seconds to wait for each request before giving up. Without a
            timeout a single unresponsive connection blocks its worker forever.

    Returns:
        A list of ``{"url": ..., "status_code": ...}`` dicts, one per request
        that did not come back with HTTP 200. ``status_code`` is ``None`` when
        the request raised (connection error, timeout, ...) and so produced
        no response at all.
    """
    failed_requests = []
    with open(csv_file_path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            url = row['url']  # Column header for URL is 'url'
            try:
                response = requests.post(endpoint_url, json={"data": url}, timeout=timeout)
            except requests.RequestException as exc:
                # Record the failure and keep going: one bad request must not
                # abort the rest of the file or lose the failures seen so far.
                print(f"Failed to send data: {url}, Error: {exc}")
                failed_requests.append({"url": url, "status_code": None})
                continue
            if response.status_code == 200:
                print(f"Successfully sent data: {url}")
            else:
                print(f"Failed to send data: {url}, Status code: {response.status_code}")
                failed_requests.append({"url": url, "status_code": response.status_code})
    return failed_requests
 # Function to write failed requests to a CSV file
 def write_failed_requests(failed_requests):
    """Write the failed requests to ``failed_requests_file`` as CSV.

    The file gets a ``url,status_code`` header followed by one row per entry.
    When ``failed_requests`` is empty nothing is written and the file is left
    untouched.
    """
    if failed_requests:
        with open(failed_requests_file, 'w', newline='') as out:
            writer = csv.DictWriter(out, fieldnames=['url', 'status_code'])
            writer.writeheader()
            writer.writerows(failed_requests)
 # Function to concatenate all split CSV files into one
 def concatenate_csv_files(split_files_dir, output_file):
    """Merge every ``.csv`` file in ``split_files_dir`` into ``output_file``.

    The output has a single ``url,type`` header followed by the rows of each
    input file. Inputs are processed in a deterministic order: shorter file
    names first, then alphabetically. For names that differ only in a numeric
    suffix (``split_file_2.csv``, ``split_file_10.csv``) this is numeric
    order, whereas the order returned by ``os.listdir`` is arbitrary.

    If ``output_file`` itself lives in ``split_files_dir`` it is skipped, so
    the function never reads the file it is writing.
    """
    fieldnames = ['url', 'type']
    output_path = os.path.abspath(output_file)
    csv_names = sorted(
        (
            name for name in os.listdir(split_files_dir)
            if name.endswith('.csv')
            and os.path.abspath(os.path.join(split_files_dir, name)) != output_path
        ),
        key=lambda name: (len(name), name),
    )
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        for name in csv_names:
            with open(os.path.join(split_files_dir, name), newline='') as infile:
                writer.writerows(csv.DictReader(infile))
 # Step 1: split the source CSV into the per-worker files (199 by default)
 split_csv_file(csv_file_path, split_files_dir)
 # Step 2: collect the path of every CSV file now present in the split directory
 split_files = [
    os.path.join(split_files_dir, file)
    for file in os.listdir(split_files_dir)
    if file.endswith('.csv')
 ]
 # Step 3: submit one task per split file to a pool of 199 threads and gather
 # the failures in the order the tasks finish
 all_failed_requests = []
 with concurrent.futures.ThreadPoolExecutor(max_workers=199) as executor:
    futures = [executor.submit(process_csv_file, split_file) for split_file in split_files]
    for future in concurrent.futures.as_completed(futures):
        all_failed_requests += future.result()
 # Step 4: persist whatever failed
 write_failed_requests(all_failed_requests)
 # Step 5: rebuild one CSV from all the split files
 concatenate_csv_files(split_files_dir, final_concatenated_file)
 print("Processing completed.")
--- a/dataset/malicious_phish.csv
+++ b/dataset/malicious_phish.csv
--- a/dataset/map_type.py
+++ b/dataset/map_type.py
@@ -0,0 +1,40 @@
 import pandas as pd
 # Load the CSV files (both paths are relative to the working directory)
 # file1 must provide a 'url' column, which is read further down in this script.
 file1 = pd.read_csv('concatenated_split_files1.csv')  
 # file2 must provide 'id', 'contents', 'result_category' and 'qr_code_id'
 # columns, all of which are used further down in this script.
 # NOTE(review): the file name suggests a database query export — confirm
 # where this file comes from and how to regenerate it.
 file2 = pd.read_csv('_select_from_safeqr_url_url_left_join_safeqr_qr_code_qr_on_qr_id_202408101634.csv') 
 # Function to strip 'http://' or 'https://' from a URL
 def strip_protocol(url):
    """Remove a leading ``https://`` or ``http://`` from *url*.

    Only the scheme at the very start of the string is removed. A scheme that
    appears later (for example inside a query parameter such as
    ``?next=http://other.site``) is part of the URL's content and is kept;
    removing it would make two different URLs compare equal.

    Non-string values (e.g. NaN for a missing cell) are returned unchanged.
    """
    if not isinstance(url, str):
        return url
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            return url[len(scheme):]
    return url
 # Add a scheme-less version of the URL column to each frame so the two can be matched
 file1['url_stripped'] = file1['url'].apply(strip_protocol)
 file2['contents_stripped'] = file2['contents'].apply(strip_protocol)
 # Lookup table: stripped URL -> (result_category, qr_code_id).
 # When a stripped URL occurs more than once in file2, the last row wins.
 url_type_qr_dict = {
    stripped: (category, qr_id)
    for stripped, category, qr_id in zip(
        file2['contents_stripped'], file2['result_category'], file2['qr_code_id']
    )
 }
 # Work on a copy of file2 so the loaded frame stays untouched
 file2_copy = file2.copy()
 # Rewrite result_category from the lookup table (None when there is no entry)
 file2_copy['result_category'] = file2_copy['contents_stripped'].map(
    lambda x: url_type_qr_dict.get(x, (None, None))[0]
 )
 # The id and helper columns are not wanted in the output
 file2_copy = file2_copy.drop(columns=['id', 'contents_stripped'])
 # Work on a copy of file1 so the loaded frame stays untouched
 file1_copy = file1.copy()
 # Attach the qr_code_id of the matching file2 row (None when there is no match)
 file1_copy['qr_code_id'] = file1_copy['url_stripped'].map(
    lambda x: url_type_qr_dict.get(x, (None, None))[1]
 )
 # The helper column is not wanted in the output
 file1_copy = file1_copy.drop(columns=['url_stripped'])
 # Write both results next to the inputs, without the pandas index
 file1_copy.to_csv('file1_updated.csv', index=False)
 file2_copy.to_csv('db_updated.csv', index=False)
--- a/dataset/ssl_error.csv
+++ b/dataset/ssl_error.csv
--- a/dataset/url_db_cleaned.csv.zip
+++ b/dataset/url_db_cleaned.csv.zip