Merge remote-tracking branch 'origin/feature-gmail-scan' into dev
This commit is contained in:
134
src/main/java/com/safeqr/app/spark/model/URLFeatures.java
Normal file
134
src/main/java/com/safeqr/app/spark/model/URLFeatures.java
Normal file
@@ -0,0 +1,134 @@
|
|||||||
|
package com.safeqr.app.spark.model;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class URLFeatures {
|
||||||
|
private Long domain;
|
||||||
|
private Long subdomain;
|
||||||
|
private Long topLevelDomain;
|
||||||
|
private Long query;
|
||||||
|
private Long fragment;
|
||||||
|
private Long redirect;
|
||||||
|
private Long path;
|
||||||
|
private Long redirectChain;
|
||||||
|
private Long hstsHeader;
|
||||||
|
private Long sslStripping;
|
||||||
|
private Long hostnameEmbedding;
|
||||||
|
private Long javascriptCheck;
|
||||||
|
private Long shorteningService;
|
||||||
|
private Long hasIpAddress;
|
||||||
|
private Long trackingDescriptions;
|
||||||
|
private Long urlEncoding;
|
||||||
|
private Long hasExecutable;
|
||||||
|
private Long tls;
|
||||||
|
private Long contents;
|
||||||
|
private String target; // This is the label, may be null if predicting
|
||||||
|
|
||||||
|
// Custom setter for tls (qr_code_type_id)
|
||||||
|
public void setTls(Long tls) {
|
||||||
|
if (tls != null) {
|
||||||
|
this.tls = tls == 1 ? 0 : tls == 9 ? 1 : tls;
|
||||||
|
} else {
|
||||||
|
this.tls = 0L;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Custom setter for hostnameEmbedding and other similar columns
|
||||||
|
public void setHostnameEmbedding(Long hostnameEmbedding) {
|
||||||
|
this.hostnameEmbedding = (hostnameEmbedding != null && hostnameEmbedding != 0) ? 1L : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setJavascriptCheck(Long javascriptCheck) {
|
||||||
|
this.javascriptCheck = (javascriptCheck != null && javascriptCheck != 0) ? 1L : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setShorteningService(Long shorteningService) {
|
||||||
|
this.shorteningService = (shorteningService != null && shorteningService != 0) ? 1L : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setHasIpAddress(Long hasIpAddress) {
|
||||||
|
this.hasIpAddress = (hasIpAddress != null && hasIpAddress != 0) ? 1L : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setUrlEncoding(Long urlEncoding) {
|
||||||
|
this.urlEncoding = (urlEncoding != null && urlEncoding != 0) ? 1L : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setHasExecutable(Long hasExecutable) {
|
||||||
|
this.hasExecutable = (hasExecutable != null && hasExecutable != 0) ? 1L : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTrackingDescriptions(Long trackingDescriptions) {
|
||||||
|
this.trackingDescriptions = (trackingDescriptions != null && trackingDescriptions != 0) ? 1L : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Custom setter for sslStripping
|
||||||
|
public void setSslStripping(String sslStripping) {
|
||||||
|
if (sslStripping != null && "true".equalsIgnoreCase(sslStripping)) {
|
||||||
|
this.sslStripping = 1L;
|
||||||
|
} else {
|
||||||
|
this.sslStripping = 0L;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Custom setter for hstsHeader
|
||||||
|
public void setHstsHeader(String hstsHeader) {
|
||||||
|
if (hstsHeader == null || "0".equals(hstsHeader)) {
|
||||||
|
this.hstsHeader = 0L;
|
||||||
|
} else if (hstsHeader.startsWith("{") && hstsHeader.endsWith("}")) {
|
||||||
|
Pattern pattern = Pattern.compile("\"(.*?)\"");
|
||||||
|
Matcher matcher = pattern.matcher(hstsHeader);
|
||||||
|
if (matcher.find() && matcher.group(1).toLowerCase().contains("no")) {
|
||||||
|
this.hstsHeader = 0L;
|
||||||
|
} else {
|
||||||
|
this.hstsHeader = 1L;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
this.hstsHeader = 0L;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Custom setters for calculating string lengths
|
||||||
|
public void setDomain(String domain) {
|
||||||
|
this.domain = (domain != null) ? (long) domain.length() : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSubdomain(String subdomain) {
|
||||||
|
this.subdomain = (subdomain != null) ? (long) subdomain.length() : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTopLevelDomain(String topLevelDomain) {
|
||||||
|
this.topLevelDomain = (topLevelDomain != null) ? (long) topLevelDomain.length() : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setQuery(String query) {
|
||||||
|
this.query = (query != null) ? (long) query.length() : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFragment(String fragment) {
|
||||||
|
this.fragment = (fragment != null) ? (long) fragment.length() : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setPath(String path) {
|
||||||
|
this.path = (path != null) ? (long) path.length() : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRedirectChain(String redirectChain) {
|
||||||
|
this.redirectChain = (redirectChain != null) ? (long) redirectChain.length() : 0L;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setContents(String contents) {
|
||||||
|
this.contents = (contents != null) ? (long) contents.length() : 0L;
|
||||||
|
}
|
||||||
|
}
|
||||||
BIN
src/main/resources/cv_model/bestModel/metadata/.part-00000.crc
Normal file
BIN
src/main/resources/cv_model/bestModel/metadata/.part-00000.crc
Normal file
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"class":"org.apache.spark.ml.PipelineModel","timestamp":1723422050490,"sparkVersion":"3.4.3","uid":"PipelineModel_4ecdd9f71524","paramMap":{"stageUids":["StringIndexer_d3c63289c493","VectorAssembler_517fc429fbfb","RandomForestClassifier_4909b7ca2bbe"]},"defaultParamMap":{}}
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"class":"org.apache.spark.ml.feature.StringIndexerModel","timestamp":1723422050930,"sparkVersion":"3.4.3","uid":"StringIndexer_d3c63289c493","paramMap":{"outputCol":"indexed_target","inputCol":"target"},"defaultParamMap":{"stringOrderType":"frequencyDesc","handleInvalid":"error","outputCol":"StringIndexer_d3c63289c493__output"}}
|
||||||
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1723422054241,"sparkVersion":"3.4.3","uid":"VectorAssembler_517fc429fbfb","paramMap":{"inputCols":["domain","subdomain","top_level_domain","query","fragment","redirect","path","redirect_chain","hsts_header","ssl_stripping","hostname_embedding","javascript_check","shortening_service","has_ip_address","tracking_descriptions","url_encoding","has_executable","tls","contents"],"outputCol":"features"},"defaultParamMap":{"handleInvalid":"error","outputCol":"VectorAssembler_517fc429fbfb__output"}}
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"class":"org.apache.spark.ml.classification.RandomForestClassificationModel","timestamp":1723422054559,"sparkVersion":"3.4.3","uid":"RandomForestClassifier_4909b7ca2bbe","paramMap":{"featuresCol":"features","numTrees":100,"maxDepth":10,"labelCol":"indexed_target"},"defaultParamMap":{"featuresCol":"features","rawPredictionCol":"rawPrediction","maxBins":32,"predictionCol":"prediction","minInstancesPerNode":1,"minWeightFractionPerNode":0.0,"cacheNodeIds":false,"minInfoGain":0.0,"numTrees":20,"maxDepth":5,"impurity":"gini","subsamplingRate":1.0,"leafCol":"","labelCol":"label","probabilityCol":"probability","bootstrap":true,"featureSubsetStrategy":"auto","checkpointInterval":10,"maxMemoryInMB":256,"seed":6182040365248539008},"numFeatures":19,"numClasses":4,"numTrees":100}
|
||||||
Binary file not shown.
Binary file not shown.
BIN
src/main/resources/cv_model/estimator/metadata/.part-00000.crc
Normal file
BIN
src/main/resources/cv_model/estimator/metadata/.part-00000.crc
Normal file
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"class":"org.apache.spark.ml.Pipeline","timestamp":1723422048367,"sparkVersion":"3.4.3","uid":"Pipeline_58a1fe22f286","paramMap":{"stageUids":["StringIndexer_d3c63289c493","VectorAssembler_517fc429fbfb","RandomForestClassifier_4909b7ca2bbe"]},"defaultParamMap":{}}
|
||||||
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"class":"org.apache.spark.ml.feature.StringIndexer","timestamp":1723422049018,"sparkVersion":"3.4.3","uid":"StringIndexer_d3c63289c493","paramMap":{"outputCol":"indexed_target","inputCol":"target"},"defaultParamMap":{"stringOrderType":"frequencyDesc","handleInvalid":"error","outputCol":"StringIndexer_d3c63289c493__output"}}
|
||||||
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1723422049580,"sparkVersion":"3.4.3","uid":"VectorAssembler_517fc429fbfb","paramMap":{"inputCols":["domain","subdomain","top_level_domain","query","fragment","redirect","path","redirect_chain","hsts_header","ssl_stripping","hostname_embedding","javascript_check","shortening_service","has_ip_address","tracking_descriptions","url_encoding","has_executable","tls","contents"],"outputCol":"features"},"defaultParamMap":{"handleInvalid":"error","outputCol":"VectorAssembler_517fc429fbfb__output"}}
|
||||||
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"class":"org.apache.spark.ml.classification.RandomForestClassifier","timestamp":1723422050089,"sparkVersion":"3.4.3","uid":"RandomForestClassifier_4909b7ca2bbe","paramMap":{"featuresCol":"features","labelCol":"indexed_target"},"defaultParamMap":{"featuresCol":"features","rawPredictionCol":"rawPrediction","maxBins":32,"predictionCol":"prediction","minInstancesPerNode":1,"minWeightFractionPerNode":0.0,"cacheNodeIds":false,"minInfoGain":0.0,"numTrees":20,"maxDepth":5,"impurity":"gini","subsamplingRate":1.0,"leafCol":"","labelCol":"label","probabilityCol":"probability","bootstrap":true,"featureSubsetStrategy":"auto","checkpointInterval":10,"maxMemoryInMB":256,"seed":6182040365248539008}}
|
||||||
BIN
src/main/resources/cv_model/evaluator/metadata/.part-00000.crc
Normal file
BIN
src/main/resources/cv_model/evaluator/metadata/.part-00000.crc
Normal file
Binary file not shown.
@@ -0,0 +1 @@
|
|||||||
|
{"class":"org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator","timestamp":1723422047764,"sparkVersion":"3.4.3","uid":"MulticlassClassificationEvaluator_f31cf4d2b0db","paramMap":{"metricName":"accuracy","labelCol":"indexed_target"},"defaultParamMap":{"eps":1.0E-15,"beta":1.0,"metricName":"f1","predictionCol":"prediction","labelCol":"label","metricLabel":0.0,"probabilityCol":"probability"}}
|
||||||
BIN
src/main/resources/cv_model/metadata/.part-00000.crc
Normal file
BIN
src/main/resources/cv_model/metadata/.part-00000.crc
Normal file
Binary file not shown.
0
src/main/resources/cv_model/metadata/_SUCCESS
Normal file
0
src/main/resources/cv_model/metadata/_SUCCESS
Normal file
1
src/main/resources/cv_model/metadata/part-00000
Normal file
1
src/main/resources/cv_model/metadata/part-00000
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"class":"org.apache.spark.ml.tuning.CrossValidatorModel","timestamp":1723422046660,"sparkVersion":"3.4.3","uid":"CrossValidatorModel_5c96b33c8d82","paramMap":{"seed":-2084793586583917283,"numFolds":5,"foldCol":"","estimatorParamMaps":[[{"parent":"RandomForestClassifier_4909b7ca2bbe","name":"numTrees","value":"100","isJson":"true"},{"parent":"RandomForestClassifier_4909b7ca2bbe","name":"maxDepth","value":"10","isJson":"true"}]]},"defaultParamMap":{"seed":880116102,"numFolds":3,"foldCol":""},"avgMetrics":[0.8736361548764979],"persistSubModels":false}
|
||||||
Reference in New Issue
Block a user