package com.safeqr.app.qrcode.service; import static com.safeqr.app.constants.CommonConstants.*; import com.safeqr.app.qrcode.entity.URLEntity; import com.safeqr.app.qrcode.model.URLModel; import com.safeqr.app.qrcode.repository.URLRepository; import com.safeqr.app.prediction.service.PredictionService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLHandshakeException; import java.io.IOException; import java.net.*; import java.nio.charset.StandardCharsets; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.Stream; @Service public class URLVerificationService { private static final int CONNECTION_TIMEOUT_MS = 10000; private static final int READ_TIMEOUT_MS = 10000; private static final Logger logger = LoggerFactory.getLogger(URLVerificationService.class); private final URLRepository urlRepository; private final PredictionService predictionService; @Autowired public URLVerificationService(URLRepository urlRepository, PredictionService predictionService) { this.urlRepository = urlRepository; this.predictionService = predictionService; } // Regular expression pattern for shortening services private static final String SHORTENING_PATTERN = "bit\\.ly|goo\\.gl|shorte\\.st|go2l\\.ink|x\\.co|ow\\.ly|t\\.co|tinyurl|tr\\.im|is\\.gd|cli\\.gs|" + "yfrog\\.com|migre\\.me|ff\\.im|tiny\\.cc|url4\\.eu|twit\\.ac|su\\.pr|twurl\\.nl|snipurl\\.com|" + "short\\.to|BudURL\\.com|ping\\.fm|post\\.ly|Just\\.as|bkite\\.com|snipr\\.com|fic\\.kr|loopt\\.us|" + "doiop\\.com|short\\.ie|kl\\.am|wp\\.me|rubyurl\\.com|om\\.ly|to\\.ly|bit\\.do|t\\.co|lnkd\\.in|" + "db\\.tt|qr\\.ae|adf\\.ly|goo\\.gl|bitly\\.com|cur\\.lv|tinyurl\\.com|ow\\.ly|bit\\.ly|ity\\.im|" + "q\\.gs|is\\.gd|po\\.st|bc\\.vc|twitthis\\.com|u\\.to|j\\.mp|buzurl\\.com|cutt\\.us|u\\.bb|yourls\\.org|" + "x\\.co|prettylinkpro\\.com|scrnch\\.me|filoops\\.info|vzturl\\.com|qr\\.net|1url\\.com|tweez\\.me|v\\.gd|" + "tr\\.im|link\\.zip\\.net"; // Regular expression pattern to match various IP address formats private static final String IP_PATTERN = "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\." + "([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|" + "(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\." + "([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|" + "((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)" + "(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}|" + "([0-9]+(?:\\.[0-9]+){3}:[0-9]+)|" + "((?:(?:\\d|[01]?\\d\\d|2[0-4]\\d|25[0-5])\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d|\\d)(?:\\/\\d{1,2})?)"; // Define a Set of suspicious file extensions private static final Set SUSPICIOUS_EXTENSIONS = Stream.of( ".exe", ".bat", ".sh", ".cmd", ".scr", ".pif", ".application", ".gadget", ".vb", ".vbs", ".js", ".jse", ".ws", ".wsf", ".msc", ".cpl", ".msi", ".ps1", ".py", ".pyc", ".pyo", ".rb", ".bin", ".run" ).collect(Collectors.toUnmodifiableSet()); // Checks if the URL has executable file public String hasExecutableFile(String urlPath) { return Stream.of(urlPath) .map(String::toLowerCase) .map(path -> { int lastDotIndex = path.lastIndexOf('.'); if (lastDotIndex != -1) { return path.substring(lastDotIndex); } return path.contains(".") || path.endsWith("/") ? null : ""; }) .filter(Objects::nonNull) .map(extension -> SUSPICIOUS_EXTENSIONS.contains(extension) || extension.isEmpty() ? "Yes" : "") .findFirst() .orElse(""); } public URLEntity getURLEntityByQRCodeId(UUID qrCodeId) { logger.info("qrCodeId retrieving: {}", qrCodeId); // return urlRepository.findByQrCodeId(qrCodeId) // .orElseThrow(() -> new ResourceNotFoundExceptions("URL not found for QR Code id: " + qrCodeId)); return urlRepository.findByQrCodeId(qrCodeId).orElse(null); } public void insertDB(URLEntity urlEntity) { urlRepository.save(urlEntity); } // Function to breakdown URL into subdomain, domain, topLevelDomain, query params, fragment public URLEntity breakdownURL(String urlString) { URLEntity urlObj = new URLEntity(); try { //URL url = new URI(encodeUrl(urlString)).toURL(); URL url = new URI(urlString.replace(" ", "")).toURL(); // Check for URL encoding in path and query String query = parseQueryParams(url.getQuery()); String pathEncoding = checkURLEncoding(url.getPath()); String queryEncoding = query != null ? checkURLEncoding(query) : ""; // Combine encoding results urlObj.setUrlEncoding(pathEncoding.equals("Yes") || queryEncoding.equals("Yes") ? "Yes" : ""); // encode url before proceeding the rest of the checks url = new URI(encodeUrl(urlString)).toURL(); String host = url.getHost(); populateHostDetails(host, urlObj); // Check for deceptive URL urlObj.setHostnameEmbedding(checkDeceptiveUrl(url)); // Check for Javascript code in url urlObj.setJavascriptCheck(checkForJavascriptCode(urlString)); // Check for url shortener urlObj.setShorteningService(hasShorteningService(urlString)); // Check for IP address urlObj.setHasIpAddress(hasIPAddress(urlString)); // Check for suspicious file extensions urlObj.setHasExecutable(hasExecutableFile(urlString)); urlObj.setPath(Optional.ofNullable(url.getPath()).filter(p -> !p.isEmpty()).orElse("")); urlObj.setQuery(parseQueryParams(url.getQuery())); urlObj.setFragment(Optional.ofNullable(url.getRef()).orElse("")); // Check for tracking parameters urlObj.setTrackingDescriptions(getTrackingDescriptions(url.getQuery())); } catch (Exception e) { logger.error("Error in breaking down URL: {}", e.getMessage()); e.printStackTrace(); } return urlObj; } private void populateHostDetails(String host, URLEntity urlObj) { logger.info("Host: {}", host); if (host != null && !host.isEmpty()) { if (isIpAddress(host)) { // Handle IP address urlObj.setDomain(host); urlObj.setTopLevelDomain(""); // No TLD for IP addresses urlObj.setSubdomain(""); // No subdomain for IP addresses } else { // Handle regular domain name String[] hostParts = host.split("\\."); int length = hostParts.length; if (length >= 2) { urlObj.setTopLevelDomain(hostParts[length - 1]); // TLD, e.g., "com" urlObj.setDomain(hostParts[length - 2]); // Domain, e.g., "example" urlObj.setSubdomain(length > 2 ? String.join(".", Arrays.copyOfRange(hostParts, 0, length - 2)) : ""); } else if (length == 1) { // Handle cases like 'localhost' where there's no TLD urlObj.setDomain(hostParts[0]); urlObj.setTopLevelDomain(""); // No TLD urlObj.setSubdomain(""); // No subdomain } } } } // List of common tracking parameters with their descriptions private static final Map TRACKING_DESCRIPTIONS = Map.ofEntries( Map.entry("utm_source", "Campaign Source: Identifies which site sent the traffic."), Map.entry("utm_medium", "Campaign Medium: Identifies what type of link was used."), Map.entry("utm_campaign", "Campaign Name: Identifies a specific product promotion or campaign."), Map.entry("utm_term", "Campaign Term: Identifies search terms."), Map.entry("utm_content", "Campaign Content: Differentiates similar content or links within the same ad."), Map.entry("gclid", "Google Click Identifier: Used by Google Ads to track clicks."), Map.entry("fbclid", "Facebook Click Identifier: Used by Facebook to track clicks."), Map.entry("tracking_id", "Tracking ID: General identifier for tracking purposes."), Map.entry("affiliate_id", "Affiliate ID: Identifies traffic from affiliates."), Map.entry("ref", "Referrer: Identifies the referrer site."), Map.entry("referrer", "Referrer: Identifies the referrer site.") ); // Regex pattern to capture key-value pairs in the query string private static final Pattern PARAM_PATTERN = Pattern.compile( "(?[^=&]+)=(?[^&]+)", Pattern.CASE_INSENSITIVE ); // Static method to detect and return tracking parameter descriptions in a URL private List getTrackingDescriptions(String query) { if (query == null || query.isEmpty()) { return Collections.emptyList(); } Matcher matcher = PARAM_PATTERN.matcher(query); List foundDescriptions = new ArrayList<>(); while (matcher.find()) { String key = matcher.group("key").toLowerCase(); String value = URLDecoder.decode(matcher.group("value"), StandardCharsets.UTF_8); if (TRACKING_DESCRIPTIONS.containsKey(key)) { foundDescriptions.add(TRACKING_DESCRIPTIONS.get(key) + ": " + value); } } return foundDescriptions; } private int checkDeceptiveUrl(URL url) { String[] parts = url.getHost().split("\\."); if (parts.length < 3) return 0; Set commonTlds = new HashSet<>(Arrays.asList("com", "org", "net", "edu", "gov")); for (int i = parts.length - 2; i >= 1; i--) { if (commonTlds.contains(parts[i]) && !commonTlds.contains(parts[i - 1]) && i != parts.length - 2) { logger.warn("Potentially deceptive URL detected: {} (Suspicious domain: {}.{})", url, parts[i - 1], parts[i]); return 1; } } return 0; } private String checkForJavascriptCode(String url) { // Decode the URL String decodedUrl = URLDecoder.decode(url, StandardCharsets.UTF_8); // Patterns to detect 'javascript:', '