About Dataset
The original dataset was downloaded from https://www.kaggle.com/datasets/sid321axn/malicious-urls-dataset. It contains 651,191 URLs: 428,103 benign (safe) URLs, 96,457 defacement URLs, 94,111 phishing URLs, and 32,520 malware URLs. The original dataset consists only of the URL and its target class. We further processed the data by running the following checks:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV below can be read
drive.mount('/content/drive')

# Turn off pandas' display truncation for both rows and columns
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Read the dataset into a DataFrame and preview the first rows
path = '/content/drive/MyDrive/Colab Notebooks/url_db_cleaned.csv'
data = pd.read_csv(path)
data.head()
Data Cleaning
# Discard the 'qr_code_id' and 'created_at' columns
data = data.drop(columns=['qr_code_id', 'created_at'])

# Use 0 as the placeholder for missing values (NaN/None) and for the literal string "{}"
data = data.fillna(0).replace("{}", 0)

# Rename 'qr_code_type_id' to 'tls' and 'result_category' to 'target'
data = data.rename(columns={'qr_code_type_id': 'tls', 'result_category': 'target'})

# Recode 'tls': 1 becomes 0 and 9 becomes 1
data['tls'] = data['tls'].replace({1: 0, 9: 1})

# Preview the first 20 rows to verify
data.head(20)
# Columns collapsed to a 0/1 indicator: 0 stays 0, anything else becomes 1
columns_to_modify = [
    'hostname_embedding',
    'javascript_check',
    'shortening_service',
    'has_ip_address',
    'url_encoding',
    'has_executable',
    'tracking_descriptions',
]
for column in columns_to_modify:
    data[column] = data[column].apply(lambda x: int(x != 0))

# 'ssl_stripping' becomes 1 when its text contains "true" (case-insensitive), otherwise 0
data['ssl_stripping'] = data['ssl_stripping'].apply(
    lambda x: int("true" in str(x).lower())
)

# Preview the first 20 rows to verify
data.head(20)
import re
def process_hsts_header(value):
    """Convert a raw 'hsts_header' cell into a 0/1 flag.

    Returns 0 when the value equals 0, when it is not a string wrapped in
    curly braces, or when the first double-quoted item inside the braces
    contains "no" (case-insensitive). Returns 1 for every other
    brace-wrapped string, including one without any double-quoted item.
    """
    # The 0 placeholder maps straight to 0
    if value == 0:
        return 0

    # Anything that is not a "{...}" string is treated as an unexpected format
    if not isinstance(value, str) or not value.startswith("{") or not value.endswith("}"):
        return 0

    # Pull out the double-quoted items; only the first one decides the flag
    quoted_items = re.findall(r'\"(.*?)\"', value)
    if quoted_items and 'no' in quoted_items[0].lower():
        return 0
    return 1
# Replace each raw 'hsts_header' value with the 0/1 flag from process_hsts_header
data['hsts_header'] = data['hsts_header'].map(process_hsts_header)

# Preview the first 20 rows to verify
data.head(20)
# Text columns that are reduced to a character-count feature
columns_to_convert = ['domain', 'top_level_domain', 'subdomain', 'query',
                      'fragment', 'path', 'redirect_chain', 'contents']


def _char_count(value):
    """Return the number of characters in *value*, or 0 when it is missing.

    The earlier cleaning step replaces NaN/None and the string "{}" with the
    number 0, so a missing field reaches this point as a non-string zero.
    Counting that placeholder with len(str(0)) would report a length of 1,
    making an absent field indistinguishable from a one-character one.
    A genuine string "0" is still counted as one character.
    """
    if pd.isnull(value):
        return 0
    if not isinstance(value, str) and value == 0:
        return 0
    return len(str(value))


# Replace each text value with its character count
for column in columns_to_convert:
    data[column] = data[column].apply(_char_count)
# Class label -> integer code (benign=0, defacement=1, malware=2, phishing=3)
target_mapping = {
    label: code
    for code, label in enumerate(('benign', 'defacement', 'malware', 'phishing'))
}
data['target'] = data['target'].replace(target_mapping)

# Preview the first rows to verify
data.head()
# Work on a copy so the integer encoding below does not modify `data`
data_encoded = data.copy()

# Integer-encode object/category columns so they can take part in the correlation
for column in data_encoded.select_dtypes(include=['object', 'category']).columns:
    data_encoded[column] = data_encoded[column].astype('category').cat.codes

# Pearson correlation between every pair of columns
correlation_matrix = data_encoded.corr(method='pearson')

# Rank column pairs by absolute correlation, strongest first.
# The strict upper triangle removes self-correlations and mirrored (A, B)/(B, A)
# pairs by position. Filtering on |r| < 1 and de-duplicating by value would also
# discard perfectly correlated pairs and distinct pairs that share a coefficient.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
sorted_correlations = (
    correlation_matrix.where(upper_triangle)
    .stack()
    .dropna()  # masked cells and undefined (NaN) correlations are not ranked
    .sort_values(ascending=False, key=abs)
)

# Heatmap of the full correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0, annot=True, fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

# Correlation matrix rendered as a table with a background gradient (optional)
styled_correlations = correlation_matrix.style.background_gradient(cmap='coolwarm')
styled_correlations
# Encode a copy of the data so `data` itself stays untouched
data_encoded = data.copy()
categorical_columns = data_encoded.select_dtypes(include=['object', 'category']).columns
for column in categorical_columns:
    data_encoded[column] = data_encoded[column].astype('category').cat.codes

# Pearson correlation between every pair of columns
correlation_matrix = data_encoded.corr(method='pearson')

# Keep only the 'target' column of the matrix, drop its self-correlation,
# and order the remaining features by absolute correlation (strongest first)
correlations_with_target = correlation_matrix['target'].drop('target')
target_correlations = correlations_with_target.sort_values(ascending=False, key=abs)

print("Correlations with the target class in descending order:")
print(target_correlations)
Use stratified sampling to select 75% of the data for training and 25% for testing (the split below uses test_size=0.25).
from sklearn.model_selection import StratifiedShuffleSplit
# Features are every column except the label
X = data.drop(columns=['target'])
y = data['target']

# A single stratified shuffle; test_size=0.25 puts 25% of the rows in the test set
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_index, test_index = next(stratified_split.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Re-attach the label to each feature set for inspection
train_data = X_train.assign(target=y_train)
test_data = X_test.assign(target=y_test)

# Report the size of each set
print(f"Training set size: {len(train_data)}")
print(f"Testing set size: {len(test_data)}")

# Inspect the training and testing sets
train_data.head()
train_data.info()
test_data.info()
train_data.describe(include='all')

# Check the class distribution in each set
print("Training set size:", len(train_data))
print(train_data['target'].value_counts())
print("Testing set size:", len(test_data))
print(test_data['target'].value_counts())
Define Function to print Model Evaluation
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
def evaluate_model(y_test, y_pred):
    """Print accuracy, plot the confusion matrix, and print the classification report.

    Args:
        y_test: True class labels, using the integer codes 0-3.
        y_pred: Predicted class labels, using the same codes.

    Returns:
        None. Results are printed and the confusion matrix is shown as a plot.
    """
    target_mapping = {0: "Benign", 1: "Defacement", 2: "Malware", 3: "Phishing"}
    class_ids = list(target_mapping.keys())
    class_names = list(target_mapping.values())

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy on test set:", accuracy)

    # Confusion matrix over all four classes, in a fixed order
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    # Plot the confusion matrix with class names on both axes
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Confusion Matrix')
    plt.show()

    # Pass `labels` so the four names always line up with the four class codes.
    # Without it the report raises when a class is absent from both y_test and
    # y_pred, because the number of names no longer matches the classes found.
    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))
# Example usage:
# evaluate_model(y_test, y_pred)
Random Forest Classifier
from math import e
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
# Pipeline: min-max scaling followed by a random forest
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Search grid with a single candidate: 100 trees, maximum depth 15
param_grid = {'rf__n_estimators': [100], 'rf__max_depth': [15]}

# Grid search scored on accuracy with cv=5, using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

# Score the refitted best pipeline on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
evaluate_model(y_test, y_pred)

# Persist the fitted pipeline to Google Drive
import joblib
joblib.dump(best_model, '/content/drive/MyDrive/random_forest_model.pkl')
Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
# Pipeline: min-max scaling followed by multinomial Naive Bayes
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('nb', MultinomialNB()),
])

# Only the smoothing parameter alpha is searched
param_grid = {'nb__alpha': [0.1, 1.0, 10.0]}

# Grid search scored on accuracy with cv=5, using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

# Score the refitted best pipeline on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
evaluate_model(y_test, y_pred)
XGBoost (eXtreme Gradient Boosting)
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
# Baseline XGBoost classifier for the four target classes
model = XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    random_state=42,
)

# Train on the training split, then score the test split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyperparameter values for the XGBoost model
param_distributions = dict(
    n_estimators=[100, 200],
    max_depth=[6, 10],
    learning_rate=[0.1, 0.2],
)

# Randomized search: 8 sampled settings, scored on accuracy with cv=5
randomized_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=8,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
randomized_search.fit(X_train, y_train)
print("Best parameters found: ", randomized_search.best_params_)

# Score the refitted best model on the held-out test set
best_model = randomized_search.best_estimator_
y_pred = best_model.predict(X_test)
evaluate_model(y_test, y_pred)

# Persist the best model to Google Drive
import joblib
joblib.dump(best_model, '/content/drive/MyDrive/randomized_search_xgb_model-2.pkl')
Multilayer Perceptron Classifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
# Pipeline: min-max scaling followed by a multilayer perceptron
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('mlp', MLPClassifier(random_state=42, max_iter=300)),
])

# Only the hidden layer width varies; the other settings each have one value
param_grid = dict(
    mlp__hidden_layer_sizes=[(50,), (100,)],
    mlp__activation=['relu'],
    mlp__solver=['adam'],
    mlp__alpha=[0.0001],
    mlp__learning_rate=['constant'],
)

# Grid search scored on accuracy with cv=3, using all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)

# Score the refitted best pipeline on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
evaluate_model(y_test, y_pred)