Merge remote-tracking branch 'origin/feature-gmail-scan' into dev

This commit is contained in:
heyethereum
2024-08-12 23:04:00 +08:00
40 changed files with 144 additions and 0 deletions

View File

@@ -0,0 +1,134 @@
package com.safeqr.app.spark.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
// Holds the per-URL feature values as boxed Longs. Most setters accept the raw
// value (a string or an arbitrary number) and store a normalised number:
// string length, a 0/1 flag, or a remapped code.
//
// NOTE(review): the custom setters are the only place normalisation happens.
// Instances populated through a generated builder or all-args constructor do
// not pass through them, so the fields are stored exactly as given - confirm
// that callers of those paths supply already-normalised values.
public class URLFeatures {

    /** Captures the text between the first pair of double quotes in a string. */
    private static final Pattern FIRST_QUOTED_VALUE = Pattern.compile("\"(.*?)\"");

    // Field order is significant for any generated all-args constructor; do not reorder.
    private Long domain;
    private Long subdomain;
    private Long topLevelDomain;
    private Long query;
    private Long fragment;
    private Long redirect;
    private Long path;
    private Long redirectChain;
    private Long hstsHeader;
    private Long sslStripping;
    private Long hostnameEmbedding;
    private Long javascriptCheck;
    private Long shorteningService;
    private Long hasIpAddress;
    private Long trackingDescriptions;
    private Long urlEncoding;
    private Long hasExecutable;
    private Long tls;
    private Long contents;
    private String target; // This is the label, may be null if predicting

    /**
     * Stores a remapped TLS code: {@code 1} becomes {@code 0}, {@code 9} becomes
     * {@code 1}, {@code null} becomes {@code 0}, and every other value is kept as is.
     *
     * <p>NOTE(review): the original comment ties this value to
     * {@code qr_code_type_id}; the meaning of codes 1 and 9 is not visible here.
     *
     * @param tls raw code, may be {@code null}
     */
    public void setTls(Long tls) {
        if (tls == null) {
            this.tls = 0L;
            return;
        }
        long code = tls;
        if (code == 1L) {
            this.tls = 0L;
        } else if (code == 9L) {
            this.tls = 1L;
        } else {
            this.tls = tls;
        }
    }

    /**
     * Stores {@code 1} when the given value is non-null and non-zero, otherwise {@code 0}.
     *
     * @param hostnameEmbedding raw value, may be {@code null}
     */
    public void setHostnameEmbedding(Long hostnameEmbedding) {
        this.hostnameEmbedding = toFlag(hostnameEmbedding);
    }

    /**
     * Stores {@code 1} when the given value is non-null and non-zero, otherwise {@code 0}.
     *
     * @param javascriptCheck raw value, may be {@code null}
     */
    public void setJavascriptCheck(Long javascriptCheck) {
        this.javascriptCheck = toFlag(javascriptCheck);
    }

    /**
     * Stores {@code 1} when the given value is non-null and non-zero, otherwise {@code 0}.
     *
     * @param shorteningService raw value, may be {@code null}
     */
    public void setShorteningService(Long shorteningService) {
        this.shorteningService = toFlag(shorteningService);
    }

    /**
     * Stores {@code 1} when the given value is non-null and non-zero, otherwise {@code 0}.
     *
     * @param hasIpAddress raw value, may be {@code null}
     */
    public void setHasIpAddress(Long hasIpAddress) {
        this.hasIpAddress = toFlag(hasIpAddress);
    }

    /**
     * Stores {@code 1} when the given value is non-null and non-zero, otherwise {@code 0}.
     *
     * @param urlEncoding raw value, may be {@code null}
     */
    public void setUrlEncoding(Long urlEncoding) {
        this.urlEncoding = toFlag(urlEncoding);
    }

    /**
     * Stores {@code 1} when the given value is non-null and non-zero, otherwise {@code 0}.
     *
     * @param hasExecutable raw value, may be {@code null}
     */
    public void setHasExecutable(Long hasExecutable) {
        this.hasExecutable = toFlag(hasExecutable);
    }

    /**
     * Stores {@code 1} when the given value is non-null and non-zero, otherwise {@code 0}.
     *
     * @param trackingDescriptions raw value, may be {@code null}
     */
    public void setTrackingDescriptions(Long trackingDescriptions) {
        this.trackingDescriptions = toFlag(trackingDescriptions);
    }

    /**
     * Stores {@code 1} when the text equals {@code "true"} ignoring case, otherwise
     * {@code 0} (including for {@code null}).
     *
     * @param sslStripping raw textual flag, may be {@code null}
     */
    public void setSslStripping(String sslStripping) {
        // equalsIgnoreCase(null) is false, so no separate null check is needed.
        this.sslStripping = "true".equalsIgnoreCase(sslStripping) ? 1L : 0L;
    }

    /**
     * Derives a 0/1 flag from a textual HSTS value.
     *
     * <ul>
     *   <li>{@code null}, {@code "0"}, or any text not wrapped in {@code {...}} gives {@code 0};</li>
     *   <li>brace-wrapped text whose first double-quoted value contains "no"
     *       (case-insensitive) gives {@code 0};</li>
     *   <li>any other brace-wrapped text, including one with no quoted value at all,
     *       gives {@code 1}.</li>
     * </ul>
     *
     * @param hstsHeader raw header text, may be {@code null}
     */
    public void setHstsHeader(String hstsHeader) {
        if (hstsHeader == null || "0".equals(hstsHeader)) {
            this.hstsHeader = 0L;
            return;
        }
        if (!(hstsHeader.startsWith("{") && hstsHeader.endsWith("}"))) {
            this.hstsHeader = 0L;
            return;
        }
        Matcher quoted = FIRST_QUOTED_VALUE.matcher(hstsHeader);
        // Locale.ROOT keeps the case folding independent of the JVM default locale.
        boolean saysNo = quoted.find()
                && quoted.group(1).toLowerCase(java.util.Locale.ROOT).contains("no");
        this.hstsHeader = saysNo ? 0L : 1L;
    }

    /**
     * Stores the length of the given text, or {@code 0} when it is {@code null}.
     *
     * @param domain raw text, may be {@code null}
     */
    public void setDomain(String domain) {
        this.domain = lengthOf(domain);
    }

    /**
     * Stores the length of the given text, or {@code 0} when it is {@code null}.
     *
     * @param subdomain raw text, may be {@code null}
     */
    public void setSubdomain(String subdomain) {
        this.subdomain = lengthOf(subdomain);
    }

    /**
     * Stores the length of the given text, or {@code 0} when it is {@code null}.
     *
     * @param topLevelDomain raw text, may be {@code null}
     */
    public void setTopLevelDomain(String topLevelDomain) {
        this.topLevelDomain = lengthOf(topLevelDomain);
    }

    /**
     * Stores the length of the given text, or {@code 0} when it is {@code null}.
     *
     * @param query raw text, may be {@code null}
     */
    public void setQuery(String query) {
        this.query = lengthOf(query);
    }

    /**
     * Stores the length of the given text, or {@code 0} when it is {@code null}.
     *
     * @param fragment raw text, may be {@code null}
     */
    public void setFragment(String fragment) {
        this.fragment = lengthOf(fragment);
    }

    /**
     * Stores the length of the given text, or {@code 0} when it is {@code null}.
     *
     * @param path raw text, may be {@code null}
     */
    public void setPath(String path) {
        this.path = lengthOf(path);
    }

    /**
     * Stores the length of the given text, or {@code 0} when it is {@code null}.
     *
     * @param redirectChain raw text, may be {@code null}
     */
    public void setRedirectChain(String redirectChain) {
        this.redirectChain = lengthOf(redirectChain);
    }

    /**
     * Stores the length of the given text, or {@code 0} when it is {@code null}.
     *
     * @param contents raw text, may be {@code null}
     */
    public void setContents(String contents) {
        this.contents = lengthOf(contents);
    }

    /** Collapses a nullable number to 1 (non-null and non-zero) or 0 (null or zero). */
    private static Long toFlag(Long raw) {
        return (raw != null && raw != 0L) ? 1L : 0L;
    }

    /** Returns the character count of the text as a Long, treating null as length 0. */
    private static Long lengthOf(String text) {
        return (text == null) ? 0L : (long) text.length();
    }
}

View File

@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.PipelineModel","timestamp":1723422050490,"sparkVersion":"3.4.3","uid":"PipelineModel_4ecdd9f71524","paramMap":{"stageUids":["StringIndexer_d3c63289c493","VectorAssembler_517fc429fbfb","RandomForestClassifier_4909b7ca2bbe"]},"defaultParamMap":{}}

View File

@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.StringIndexerModel","timestamp":1723422050930,"sparkVersion":"3.4.3","uid":"StringIndexer_d3c63289c493","paramMap":{"outputCol":"indexed_target","inputCol":"target"},"defaultParamMap":{"stringOrderType":"frequencyDesc","handleInvalid":"error","outputCol":"StringIndexer_d3c63289c493__output"}}

View File

@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1723422054241,"sparkVersion":"3.4.3","uid":"VectorAssembler_517fc429fbfb","paramMap":{"inputCols":["domain","subdomain","top_level_domain","query","fragment","redirect","path","redirect_chain","hsts_header","ssl_stripping","hostname_embedding","javascript_check","shortening_service","has_ip_address","tracking_descriptions","url_encoding","has_executable","tls","contents"],"outputCol":"features"},"defaultParamMap":{"handleInvalid":"error","outputCol":"VectorAssembler_517fc429fbfb__output"}}

View File

@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.classification.RandomForestClassificationModel","timestamp":1723422054559,"sparkVersion":"3.4.3","uid":"RandomForestClassifier_4909b7ca2bbe","paramMap":{"featuresCol":"features","numTrees":100,"maxDepth":10,"labelCol":"indexed_target"},"defaultParamMap":{"featuresCol":"features","rawPredictionCol":"rawPrediction","maxBins":32,"predictionCol":"prediction","minInstancesPerNode":1,"minWeightFractionPerNode":0.0,"cacheNodeIds":false,"minInfoGain":0.0,"numTrees":20,"maxDepth":5,"impurity":"gini","subsamplingRate":1.0,"leafCol":"","labelCol":"label","probabilityCol":"probability","bootstrap":true,"featureSubsetStrategy":"auto","checkpointInterval":10,"maxMemoryInMB":256,"seed":6182040365248539008},"numFeatures":19,"numClasses":4,"numTrees":100}

View File

@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.Pipeline","timestamp":1723422048367,"sparkVersion":"3.4.3","uid":"Pipeline_58a1fe22f286","paramMap":{"stageUids":["StringIndexer_d3c63289c493","VectorAssembler_517fc429fbfb","RandomForestClassifier_4909b7ca2bbe"]},"defaultParamMap":{}}

View File

@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.StringIndexer","timestamp":1723422049018,"sparkVersion":"3.4.3","uid":"StringIndexer_d3c63289c493","paramMap":{"outputCol":"indexed_target","inputCol":"target"},"defaultParamMap":{"stringOrderType":"frequencyDesc","handleInvalid":"error","outputCol":"StringIndexer_d3c63289c493__output"}}

View File

@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1723422049580,"sparkVersion":"3.4.3","uid":"VectorAssembler_517fc429fbfb","paramMap":{"inputCols":["domain","subdomain","top_level_domain","query","fragment","redirect","path","redirect_chain","hsts_header","ssl_stripping","hostname_embedding","javascript_check","shortening_service","has_ip_address","tracking_descriptions","url_encoding","has_executable","tls","contents"],"outputCol":"features"},"defaultParamMap":{"handleInvalid":"error","outputCol":"VectorAssembler_517fc429fbfb__output"}}

View File

@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.classification.RandomForestClassifier","timestamp":1723422050089,"sparkVersion":"3.4.3","uid":"RandomForestClassifier_4909b7ca2bbe","paramMap":{"featuresCol":"features","labelCol":"indexed_target"},"defaultParamMap":{"featuresCol":"features","rawPredictionCol":"rawPrediction","maxBins":32,"predictionCol":"prediction","minInstancesPerNode":1,"minWeightFractionPerNode":0.0,"cacheNodeIds":false,"minInfoGain":0.0,"numTrees":20,"maxDepth":5,"impurity":"gini","subsamplingRate":1.0,"leafCol":"","labelCol":"label","probabilityCol":"probability","bootstrap":true,"featureSubsetStrategy":"auto","checkpointInterval":10,"maxMemoryInMB":256,"seed":6182040365248539008}}

View File

@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator","timestamp":1723422047764,"sparkVersion":"3.4.3","uid":"MulticlassClassificationEvaluator_f31cf4d2b0db","paramMap":{"metricName":"accuracy","labelCol":"indexed_target"},"defaultParamMap":{"eps":1.0E-15,"beta":1.0,"metricName":"f1","predictionCol":"prediction","labelCol":"label","metricLabel":0.0,"probabilityCol":"probability"}}

Binary file not shown.

View File

@@ -0,0 +1 @@
{"class":"org.apache.spark.ml.tuning.CrossValidatorModel","timestamp":1723422046660,"sparkVersion":"3.4.3","uid":"CrossValidatorModel_5c96b33c8d82","paramMap":{"seed":-2084793586583917283,"numFolds":5,"foldCol":"","estimatorParamMaps":[[{"parent":"RandomForestClassifier_4909b7ca2bbe","name":"numTrees","value":"100","isJson":"true"},{"parent":"RandomForestClassifier_4909b7ca2bbe","name":"maxDepth","value":"10","isJson":"true"}]]},"defaultParamMap":{"seed":880116102,"numFolds":3,"foldCol":""},"avgMetrics":[0.8736361548764979],"persistSubModels":false}