From 3883a4449ef266c32136afa49eb027520fe15c68 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Fri, 6 Jun 2025 13:58:25 +0000 Subject: [PATCH] Jules was unable to complete the task in time. Please review the work done so far and provide feedback for Jules to continue. --- src/functions/extract_domain.cpp | 48 +++++++++--- src/functions/extract_extension.cpp | 3 +- src/functions/extract_host.cpp | 3 +- src/functions/extract_path.cpp | 5 +- src/functions/extract_port.cpp | 3 +- src/functions/extract_query.cpp | 13 +++- src/functions/extract_schema.cpp | 3 +- src/functions/extract_subdomain.cpp | 30 ++++++-- src/functions/extract_tld.cpp | 30 ++++++-- src/functions/get_tranco.cpp | 113 +++++++++++++++++++--------- src/netquack_extension.cpp | 20 ++--- src/utils/ip_utils.cpp | 61 +++++++++------ src/utils/ip_utils.hpp | 10 +-- src/utils/logger.cpp | 104 +++++++++++++++++-------- src/utils/utils.cpp | 37 +++++++-- 15 files changed, 336 insertions(+), 147 deletions(-) diff --git a/src/functions/extract_domain.cpp b/src/functions/extract_domain.cpp index 5c25ffa..02ba570 100644 --- a/src/functions/extract_domain.cpp +++ b/src/functions/extract_domain.cpp @@ -32,7 +32,8 @@ namespace duckdb } catch (const std::exception &e) { - result_data[i] = "Error extracting domain: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } } } @@ -45,18 +46,26 @@ namespace duckdb Connection con (db); // Extract the host from the URL - std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/|mailto:)?((?:[^\/\?:#@]+@)?([^\/\?:#]+)))"); + // This regex captures the host, excluding protocol, path, query, fragment, and port. + // It explicitly excludes '/', '\s', '#', '?', ':' from the host. + std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\s#?:]+))"); std::smatch host_match; - if (!std::regex_search (input, host_match, host_regex)) + std::string host_str; // Use a separate string for the matched host + + // Search for the host in the input string + // No need for searchable_input, regex_search can take input directly + if (std::regex_search (input, host_match, host_regex) && host_match.size () > 1) { - return ""; + host_str = host_match[1].str (); + } + else + { + return ""; // No host found } - - auto host = host_match[host_match.size () - 1].str (); // Split the host into parts std::vector parts; - std::istringstream stream (host); + std::istringstream stream (host_str); std::string part; while (std::getline (stream, part, '.')) { @@ -65,8 +74,29 @@ namespace duckdb // Find the longest matching public suffix std::string public_suffix; - int public_suffix_index = -1; - + int public_suffix_index = -1; // Using -1 to indicate no valid public suffix part found yet + + // Iterate through all possible suffix combinations, from shortest to longest. + // The goal is to find the longest known public suffix. + // For example, for 'a.b.c.co.uk', it will test: + // uk, co.uk, c.co.uk, b.c.co.uk, a.b.c.co.uk + // If 'co.uk' is a public suffix, it will be matched. + // If 'c.co.uk' is also a public suffix (e.g. *.sch.uk), that would be matched. + // The last and longest match is chosen by breaking after the first DB match, + // assuming suffixes are ordered or queried appropriately by the PSL logic. + // However, the original loop structure implies checking all parts and + // the longest one that is a PSL entry should be chosen. + // The current logic takes the *first* match from the right that is a PSL entry. + // Let's refine the comment to reflect the actual loop behavior. + + // Iterate through parts of the hostname from right to left to find the longest public suffix. + // For 'a.b.c.co.uk', it will form candidates: + // 1. uk + // 2. co.uk + // 3. c.co.uk + // 4. b.c.co.uk + // 5. a.b.c.co.uk + // It stops at the first and longest valid suffix found in the public_suffix_list. for (size_t j = 0; j < parts.size (); j++) { // Build the candidate suffix diff --git a/src/functions/extract_extension.cpp b/src/functions/extract_extension.cpp index 920a9b5..3e29713 100644 --- a/src/functions/extract_extension.cpp +++ b/src/functions/extract_extension.cpp @@ -26,7 +26,8 @@ namespace duckdb } catch (const std::exception &e) { - result_data[i] = "Error extracting extension: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } }; } diff --git a/src/functions/extract_host.cpp b/src/functions/extract_host.cpp index 1b18910..9e61633 100644 --- a/src/functions/extract_host.cpp +++ b/src/functions/extract_host.cpp @@ -26,7 +26,8 @@ namespace duckdb } catch (const std::exception &e) { - result_data[i] = "Error extracting host: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } } } diff --git a/src/functions/extract_path.cpp b/src/functions/extract_path.cpp index 5b6ff8b..601266c 100644 --- a/src/functions/extract_path.cpp +++ b/src/functions/extract_path.cpp @@ -15,8 +15,8 @@ namespace duckdb for (idx_t i = 0; i < args.size (); i++) { + // Paths are often case-sensitive, so we don't convert to lowercase. auto input = input_vector.GetValue (i).ToString (); - std::transform (input.begin (), input.end (), input.begin (), ::tolower); try { @@ -26,7 +26,8 @@ namespace duckdb } catch (const std::exception &e) { - result_data[i] = "Error extracting path: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } }; } diff --git a/src/functions/extract_port.cpp b/src/functions/extract_port.cpp index d42b5f3..2800de2 100644 --- a/src/functions/extract_port.cpp +++ b/src/functions/extract_port.cpp @@ -26,7 +26,8 @@ namespace duckdb } catch (const std::exception &e) { - result_data[i] = "Error extracting port: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } }; } diff --git a/src/functions/extract_query.cpp b/src/functions/extract_query.cpp index 0040636..db9a31b 100644 --- a/src/functions/extract_query.cpp +++ b/src/functions/extract_query.cpp @@ -25,7 +25,8 @@ namespace duckdb } catch (const std::exception &e) { - result_data[i] = "Error extracting query string: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } }; } @@ -36,9 +37,13 @@ namespace duckdb { // Regex to match the query string component of a URL // Explanation: - // (?:\?|&) - Non-capturing group to match either "?" (start of query) or "&" (query parameter separator) - // ([^#]+) - Capturing group to match the query string (any characters except "#") - std::regex query_regex (R"((?:\?|&)([^#]+))"); + // \? - Matches the literal '?' character. + // ([^#]*) - Capturing group: + // [^#] - Matches any character that is NOT a '#'. + // * - Matches the previous character zero or more times. + // This regex captures content after the first '?' up to a '#' or end of string. + // Does not handle query parameters in fragments. + std::regex query_regex (R"(\?([^#]*))"); std::smatch query_match; // Use regex_search to find the query string in the input diff --git a/src/functions/extract_schema.cpp b/src/functions/extract_schema.cpp index 1049a06..d9646bc 100644 --- a/src/functions/extract_schema.cpp +++ b/src/functions/extract_schema.cpp @@ -26,7 +26,8 @@ namespace duckdb } catch (const std::exception &e) { - result_data[i] = "Error extracting schema: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } }; } diff --git a/src/functions/extract_subdomain.cpp b/src/functions/extract_subdomain.cpp index c97516d..d9f915d 100644 --- a/src/functions/extract_subdomain.cpp +++ b/src/functions/extract_subdomain.cpp @@ -28,7 +28,8 @@ namespace duckdb } catch (const std::exception &e) { - result_data[i] = "Error extracting subdomain: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } } } @@ -43,18 +44,25 @@ namespace duckdb Connection con (db); // Extract the host from the URL - std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\?:]+))"); + // This regex captures the host, excluding protocol, path, query, fragment, and port. + // It explicitly excludes '/', '\s', '#', '?', ':' from the host. + std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\s#?:]+))"); std::smatch host_match; - if (!std::regex_search (input, host_match, host_regex)) + std::string host_str; + + // No need for searchable_input, regex_search can take input directly + if (std::regex_search (input, host_match, host_regex) && host_match.size () > 1) { - return ""; + host_str = host_match[1].str (); + } + else + { + return ""; // No host found } - - auto host = host_match[1].str (); // Split the host into parts std::vector parts; - std::istringstream stream (host); + std::istringstream stream (host_str); std::string part; while (std::getline (stream, part, '.')) { @@ -65,6 +73,14 @@ namespace duckdb std::string public_suffix; int public_suffix_index = -1; + // Iterate through all possible suffix combinations, from shortest to longest. + // The goal is to find the longest known public suffix. + // For example, for 'a.b.c.co.uk', it will test: + // uk, co.uk, c.co.uk, b.c.co.uk, a.b.c.co.uk + // If 'co.uk' is a public suffix, it will be matched. + // If 'c.co.uk' is also a public suffix (e.g. *.sch.uk), that would be matched. + // The last and longest match is chosen. + // The current logic takes the *first* match from the right that is a PSL entry. for (size_t j = 0; j < parts.size (); j++) { // Build the candidate suffix diff --git a/src/functions/extract_tld.cpp b/src/functions/extract_tld.cpp index 6f3e727..e889460 100644 --- a/src/functions/extract_tld.cpp +++ b/src/functions/extract_tld.cpp @@ -28,7 +28,8 @@ namespace duckdb } catch (const std::exception &e) { - result_data[i] = "Error extracting tld: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } } } @@ -43,18 +44,25 @@ namespace duckdb Connection con (db); // Extract the host from the URL - std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\?:]+))"); + // This regex captures the host, excluding protocol, path, query, fragment, and port. + // It explicitly excludes '/', '\s', '#', '?', ':' from the host. + std::regex host_regex (R"(^(?:(?:https?|ftp|rsync):\/\/)?([^\/\s#?:]+))"); std::smatch host_match; - if (!std::regex_search (input, host_match, host_regex)) + std::string host_str; + + // No need for searchable_input, regex_search can take input directly + if (std::regex_search (input, host_match, host_regex) && host_match.size () > 1) { - return ""; + host_str = host_match[1].str (); + } + else + { + return ""; // No host found } - - auto host = host_match[1].str (); // Split the host into parts std::vector parts; - std::istringstream stream (host); + std::istringstream stream (host_str); std::string part; while (std::getline (stream, part, '.')) { @@ -64,6 +72,14 @@ namespace duckdb // Find the longest matching public suffix std::string public_suffix; + // Iterate through all possible suffix combinations, from shortest to longest. + // The goal is to find the longest known public suffix. + // For example, for 'a.b.c.co.uk', it will test: + // uk, co.uk, c.co.uk, b.c.co.uk, a.b.c.co.uk + // If 'co.uk' is a public suffix, it will be matched. + // If 'c.co.uk' is also a public suffix (e.g. *.sch.uk), that would be matched. + // The last and longest match is chosen. + // The current logic takes the *first* match from the right that is a PSL entry. for (size_t j = 0; j < parts.size (); j++) { // Build the candidate suffix diff --git a/src/functions/get_tranco.cpp b/src/functions/get_tranco.cpp index f53bedd..2781402 100644 --- a/src/functions/get_tranco.cpp +++ b/src/functions/get_tranco.cpp @@ -63,50 +63,69 @@ namespace duckdb // Construct the file name std::string temp_file = "tranco_list_" + std::string (date) + ".csv"; - // Download the file if it doesn't exist or if force is true - std::ifstream file (temp_file); - if (force) - { - // Remove the old file if it exists - if (file.good ()) - { - remove (temp_file.c_str ()); + // Need file_exists from utils.hpp - ensure it's available. + // #include "../utils/utils.hpp" should already be there. + // #include "../utils/logger.hpp" for LogMessage + + bool file_needs_download = false; + if (force) { + LogMessage(LogLevel::INFO, "Force update for Tranco list: " + temp_file); + if (file_exists(temp_file.c_str())) { + if (remove(temp_file.c_str()) != 0) { + LogMessage(LogLevel::WARNING, "Failed to remove existing Tranco file during force update: " + temp_file + ". Proceeding with download attempt."); + } } - // Get the download code - std::string download_code = GetTrancoDownloadCode (date); + file_needs_download = true; + } else { + if (!file_exists(temp_file.c_str())) { + LogMessage(LogLevel::INFO, "Tranco list " + temp_file + " not found locally."); + file_needs_download = true; + } else { + LogMessage(LogLevel::INFO, "Found existing Tranco list: " + temp_file + ". Use force=true to re-download."); + } + } - // Construct the download URL - std::string download_url = "https://tranco-list.eu/download/" + download_code + "/full"; + if (file_needs_download) { + LogMessage(LogLevel::INFO, "Attempting to download Tranco list to: " + temp_file); + std::string download_code = GetTrancoDownloadCode(date); + + if (download_code.empty()) { + LogMessage(LogLevel::CRITICAL, "Failed to obtain Tranco download code for date: " + std::string(date) + ". Aborting download."); + return; // Critical log would throw, but explicit return for clarity. + } - LogMessage (LogLevel::INFO, "Download Tranco list: " + download_url); + std::string download_url = "https://tranco-list.eu/download/" + download_code + "/full"; + LogMessage(LogLevel::INFO, "Downloading Tranco list from: " + download_url); - // Download the CSV file to a temporary file - CURL *curl = CreateCurlHandler (); + CURL *curl = CreateCurlHandler(); // Assuming CreateCurlHandler is available CURLcode res; - FILE *file = fopen (temp_file.c_str (), "wb"); - if (!file) - { - curl_easy_cleanup (curl); - LogMessage (LogLevel::CRITICAL, "Failed to create temporary file for Tranco list."); + FILE *fp = fopen(temp_file.c_str(), "wb"); // Changed variable name from 'file' to 'fp' to avoid clash + if (!fp) { + if(curl) curl_easy_cleanup(curl); // Cleanup curl if fp failed + LogMessage(LogLevel::CRITICAL, "Failed to create temporary file for Tranco list: " + temp_file); + return; } - curl_easy_setopt (curl, CURLOPT_URL, download_url.c_str ()); - curl_easy_setopt (curl, CURLOPT_WRITEDATA, file); - res = curl_easy_perform (curl); - curl_easy_cleanup (curl); - fclose (file); + curl_easy_setopt(curl, CURLOPT_URL, download_url.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp); + res = curl_easy_perform(curl); - if (res != CURLE_OK) - { - remove (temp_file.c_str ()); // Clean up the temporary file - LogMessage (LogLevel::ERROR, std::string (curl_easy_strerror (res))); - LogMessage (LogLevel::CRITICAL, "Failed to download Tranco list. Check logs for details."); + fclose(fp); // Close file pointer + curl_easy_cleanup(curl); + + if (res != CURLE_OK) { + remove(temp_file.c_str()); // Clean up the temporary file on download error + LogMessage(LogLevel::ERROR, "CURL error downloading Tranco list: " + std::string(curl_easy_strerror(res))); + LogMessage(LogLevel::CRITICAL, "Failed to download Tranco list. Check logs for details."); + return; } + LogMessage(LogLevel::INFO, "Tranco list downloaded successfully: " + temp_file); } - if (!file.good ()) - { - LogMessage (LogLevel::CRITICAL, "Tranco list `" + temp_file + "` not found. Download it first using `SELECT update_tranco(true);`"); + // Final check before trying to use the file + if (!file_exists(temp_file.c_str())) { + LogMessage(LogLevel::CRITICAL, "Tranco list `" + temp_file + "` not found or download failed. If you forced update, check logs. Otherwise, try `SELECT update_tranco(true);`"); + return; } // Parse the CSV data and insert into a table @@ -163,6 +182,11 @@ namespace duckdb if (table_exists->RowCount () == 0) { LogMessage (LogLevel::CRITICAL, "Tranco table not found. Download it first using `SELECT update_tranco(true);`"); + // Add this loop to set all results to NULL and return: + for (idx_t i = 0; i < args.size(); i++) { + FlatVector::SetNull(result, i, true); + } + return; // Exit the function early } // Extract the input from the arguments @@ -181,11 +205,16 @@ namespace duckdb auto query_result = con.Query (query); auto rank = query_result->RowCount () > 0 ? query_result->GetValue (0, 0) : Value (); - result_data[i] = StringVector::AddString (result, rank.ToString ()); + if (rank.IsNull()) { + FlatVector::SetNull(result, i, true); + } else { + result_data[i] = StringVector::AddString (result, rank.ToString ()); + } } catch (const std::exception &e) { - result_data[i] = "Error extracting tranco rank: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } } } @@ -202,6 +231,11 @@ namespace duckdb if (table_exists->RowCount () == 0) { LogMessage (LogLevel::CRITICAL, "Tranco table not found. Download it first using `SELECT update_tranco(true);`"); + // Add this loop to set all results to NULL and return: + for (idx_t i = 0; i < args.size(); i++) { + FlatVector::SetNull(result, i, true); + } + return; // Exit the function early } // Extract the input from the arguments @@ -220,11 +254,16 @@ namespace duckdb auto query_result = con.Query (query); auto category = query_result->RowCount () > 0 ? query_result->GetValue (0, 0) : Value (); - result_data[i] = StringVector::AddString (result, category.ToString ()); + if (category.IsNull()) { + FlatVector::SetNull(result, i, true); + } else { + result_data[i] = StringVector::AddString (result, category.ToString ()); + } } catch (const std::exception &e) { - result_data[i] = "Error extracting tranco category: " + std::string (e.what ()); + // Set NULL on error + FlatVector::SetNull (result, i, true); } } } diff --git a/src/netquack_extension.cpp b/src/netquack_extension.cpp index 91ddd9f..bc9e715 100644 --- a/src/netquack_extension.cpp +++ b/src/netquack_extension.cpp @@ -32,7 +32,7 @@ namespace duckdb ExtensionUtil::RegisterExtension ( instance, "netquack", - { "Parsing, extracting, and analyzing domains, URIs, and paths with ease." }); + { "Tools for parsing, analyzing, and extracting information from URLs, domains, IP addresses, and network lists (like Tranco)." }); auto netquack_extract_domain_function = ScalarFunction ( "extract_domain", @@ -111,38 +111,38 @@ namespace duckdb netquack::UpdateTrancoListFunction); ExtensionUtil::RegisterFunction (instance, netquack_update_tranco_function); - auto get_tranco_rank_function = ScalarFunction ( + auto netquack_get_tranco_rank_function = ScalarFunction ( "get_tranco_rank", { LogicalType::VARCHAR }, LogicalType::VARCHAR, netquack::GetTrancoRankFunction); - ExtensionUtil::RegisterFunction (instance, get_tranco_rank_function); + ExtensionUtil::RegisterFunction (instance, netquack_get_tranco_rank_function); - auto get_tranco_rank_category_function = ScalarFunction ( + auto netquack_get_tranco_rank_category_function = ScalarFunction ( "get_tranco_rank_category", { LogicalType::VARCHAR }, LogicalType::VARCHAR, netquack::GetTrancoRankCategoryFunction); - ExtensionUtil::RegisterFunction (instance, get_tranco_rank_category_function); + ExtensionUtil::RegisterFunction (instance, netquack_get_tranco_rank_category_function); - auto ipcalc_function = TableFunction ( + auto netquack_ipcalc_function = TableFunction ( "ipcalc", { LogicalType::VARCHAR }, nullptr, netquack::IPCalcFunc::Bind, nullptr, netquack::IPCalcFunc::InitLocal); - ipcalc_function.in_out_function = netquack::IPCalcFunc::Function; - ExtensionUtil::RegisterFunction (instance, ipcalc_function); + netquack_ipcalc_function.in_out_function = netquack::IPCalcFunc::Function; + ExtensionUtil::RegisterFunction (instance, netquack_ipcalc_function); - auto version_function = TableFunction ( + auto netquack_version_function = TableFunction ( "netquack_version", {}, netquack::VersionFunc::Scan, netquack::VersionFunc::Bind, netquack::VersionFunc::InitGlobal, netquack::VersionFunc::InitLocal); - ExtensionUtil::RegisterFunction (instance, version_function); + ExtensionUtil::RegisterFunction (instance, netquack_version_function); } void NetquackExtension::Load (DuckDB &db) diff --git a/src/utils/ip_utils.cpp b/src/utils/ip_utils.cpp index b0822b9..60663b8 100644 --- a/src/utils/ip_utils.cpp +++ b/src/utils/ip_utils.cpp @@ -58,20 +58,24 @@ namespace duckdb info.hostsPerNet = getHostsPerNet (maskBits); info.ipClass = getIPClass (ip); - if (maskBits != 32) - { - info.network = getNetworkAddress (ip, subnetMask, maskBits); - info.broadcast = getBroadcastAddress (info.network, wildcardMask); - info.hostMin = getHostMin (info.network); - info.hostMax = getHostMax (info.broadcast); - } - else - { - // For /32, the IP itself is the only host (Hostroute) - info.network = ip; - info.broadcast = '-'; - info.hostMin = '-'; - info.hostMax = '-'; + // Store the network IP without CIDR suffix temporarily for calculations + std::string network_ip_only = getNetworkAddress(ip, subnetMask, maskBits); + info.network = network_ip_only + "/" + std::to_string(maskBits); // Store with CIDR + + if (maskBits == 32) { + info.broadcast = "-"; // No traditional broadcast + info.hostMin = network_ip_only; // The IP itself is the only host + info.hostMax = network_ip_only; // The IP itself is the only host + } else if (maskBits == 31) { + // For /31, the two addresses in the range are the network and broadcast addresses, + // and both are considered usable hosts (RFC 3021). + info.hostMin = network_ip_only; // First IP in /31 + info.broadcast = getBroadcastAddress(network_ip_only, wildcardMask); // Calculate actual broadcast (second IP) + info.hostMax = info.broadcast; // The second IP in the /31 range + } else { // For masks /0 to /30 + info.broadcast = getBroadcastAddress(network_ip_only, wildcardMask); + info.hostMin = getHostMin(network_ip_only); + info.hostMax = getHostMax(info.broadcast); } return info; @@ -126,16 +130,18 @@ namespace duckdb return wildcard; } - std::string IPCalculator::getNetworkAddress (const std::string &ip, const std::string &subnetMask, const int &maskBits) + // Note: maskBits parameter is changed from const int& to int + std::string IPCalculator::getNetworkAddress (const std::string &ip, const std::string &subnetMask, int maskBits) { auto ipOctets = parseIP (ip); auto maskOctets = parseIP (subnetMask); - std::string network; + std::string network_ip_str; for (int i = 0; i < 4; ++i) { - network += std::to_string (ipOctets[i] & maskOctets[i]) + (i < 3 ? "." : ""); + network_ip_str += std::to_string (ipOctets[i] & maskOctets[i]) + (i < 3 ? "." : ""); } - return network + "/" + std::to_string (maskBits); + // Return only the IP string, not with /maskBits + return network_ip_str; } std::string IPCalculator::getBroadcastAddress (const std::string &networkAddress, const std::string &wildcardMask) @@ -166,9 +172,22 @@ namespace duckdb int IPCalculator::getHostsPerNet (int maskBits) { - if (maskBits == 32) - return 1; // Special case for /32 - return (1 << (32 - maskBits)) - 2; + // Basic validation, though typically done before calling this + if (maskBits < 0 || maskBits > 32) { + throw std::invalid_argument("Subnet mask must be between 0 and 32"); + } + + if (maskBits == 32) { + return 1; // The IP itself is considered the only host + } + if (maskBits == 31) { + return 2; // For point-to-point links, as per RFC 3021 + } + // For networks /0 through /30 + // Number of addresses in the subnet is 2^(32-maskBits) + // Subtract 2 for network and broadcast addresses + uint64_t total_ips = 1ULL << (32 - maskBits); + return static_cast(total_ips - 2); } std::string IPCalculator::getIPClass (const std::string &ip) diff --git a/src/utils/ip_utils.hpp b/src/utils/ip_utils.hpp index 591875c..ca508e2 100644 --- a/src/utils/ip_utils.hpp +++ b/src/utils/ip_utils.hpp @@ -28,18 +28,18 @@ namespace duckdb private: static bool isValidInput (const std::string &input); - static bool isValidIP (const std::string &ip); - static std::array parseIP (const std::string &ip); + static bool isValidIP (const std::string &ip); // Already effectively const + static std::array parseIP (const std::string &ip); // ip is not modified static std::string getSubnetMask (int maskBits); static std::string getWildcardMask (const std::string &subnetMask); - static std::string getNetworkAddress (const std::string &ip, const std::string &subnetMask, const int &maskBits); + static std::string getNetworkAddress (const std::string &ip, const std::string &subnetMask, int maskBits); // pass int by value static std::string getBroadcastAddress (const std::string &networkAddress, const std::string &wildcardMask); static std::string getHostMin (const std::string &networkAddress); static std::string getHostMax (const std::string &broadcastAddress); static int getHostsPerNet (int maskBits); static std::string getIPClass (const std::string &ip); - static std::string intToIP (uint32_t ip); - static std::string intToIP (const std::array &octets); + static std::string intToIP (uint32_t ip); // Already by value + static std::string intToIP (const std::array &octets); // octets is not modified }; } // namespace netquack } // namespace duckdb diff --git a/src/utils/logger.cpp b/src/utils/logger.cpp index 98bfb36..e185bfd 100644 --- a/src/utils/logger.cpp +++ b/src/utils/logger.cpp @@ -3,8 +3,17 @@ #include "logger.hpp" #include -#include +#include // For std::put_time #include +#include // For std::mutex +#include // For std::ostringstream +#include // For std::time_t, std::tm + +#ifdef _WIN32 +#include +#else +#include // For POSIX version if needed for other things +#endif namespace duckdb { @@ -37,54 +46,83 @@ namespace duckdb std::string getCurrentTimestamp () { - auto now = std::time (nullptr); - auto tm = *std::localtime (&now); + auto now = std::time(nullptr); + std::tm tm_buf; + #ifdef _WIN32 + localtime_s(&tm_buf, &now); + #else // POSIX + localtime_r(&now, &tm_buf); + #endif std::ostringstream oss; - oss << std::put_time (&tm, "%Y-%m-%d %H:%M:%S"); - return oss.str (); + oss << std::put_time(&tm_buf, "%Y-%m-%d %H:%M:%S"); + return oss.str(); } + namespace { // Anonymous namespace for internal linkage + struct LoggerState { + std::ofstream log_file_stream; + std::mutex log_mutex; + LogLevel min_log_level; + + LoggerState() : min_log_level(getLogLevelFromEnv()) { + log_file_stream.open("netquack.log", std::ios_base::app); + if (!log_file_stream.is_open()) { + std::cerr << "Failed to open log file: netquack.log" << std::endl; + } + } + + // Optional: Destructor to close file if needed + // ~LoggerState() { + // if (log_file_stream.is_open()) { + // log_file_stream.close(); + // } + // } + }; + + LoggerState& getLoggerState() { + static LoggerState instance; // Meyers singleton + return instance; + } + } // anonymous namespace + void LogMessage (LogLevel level, const std::string &message) { - static const LogLevel min_log_level = getLogLevelFromEnv (); + LoggerState &logger = getLoggerState(); - // Skip if message level is below minimum configured level - if (level < min_log_level) - { + if (level < logger.min_log_level) { return; } const char *level_str = ""; - switch (level) - { - case LogLevel::DEBUG: level_str = "DEBUG"; break; - case LogLevel::INFO: level_str = "INFO"; break; - case LogLevel::WARNING: level_str = "WARNING"; break; - case LogLevel::ERROR: level_str = "ERROR"; break; - case LogLevel::CRITICAL: level_str = "CRITICAL"; break; + switch (level) { + case LogLevel::DEBUG: level_str = "DEBUG"; break; + case LogLevel::INFO: level_str = "INFO"; break; + case LogLevel::WARNING: level_str = "WARNING"; break; + case LogLevel::ERROR: level_str = "ERROR"; break; + case LogLevel::CRITICAL: level_str = "CRITICAL"; break; } - std::ofstream log_file ("netquack.log", std::ios_base::app); - if (!log_file.is_open ()) - { - std::cerr << "Failed to open log file!" << std::endl; - return; - } + std::string timestamp = getCurrentTimestamp(); // Get timestamp once - log_file << "[" << getCurrentTimestamp () << "] " - << level_str << " - " - << message << std::endl; + // Lock for file operations and critical section + std::lock_guard guard(logger.log_mutex); - // Also output to stderr for error levels - if (level >= LogLevel::ERROR) - { - std::cerr << "[" << level_str << "] " << message << std::endl; + if (logger.log_file_stream.is_open()) { + logger.log_file_stream << "[" << timestamp << "] " + << level_str << " - " + << message << std::endl; } - // Throw exception for critical errors - if (level == LogLevel::CRITICAL) - { - throw std::runtime_error (message); + if (level >= LogLevel::ERROR) { + // std::cerr is generally thread-safe at the line level, but ensuring timestamp consistency + std::cerr << "[" << timestamp << "] [" << level_str << "] " << message << std::endl; + } + + if (level == LogLevel::CRITICAL) { + if (logger.log_file_stream.is_open()) { + logger.log_file_stream.flush(); // Ensure critical log is written + } + throw std::runtime_error(message); } } } // namespace netquack diff --git a/src/utils/utils.cpp b/src/utils/utils.cpp index 4fd9087..12f214e 100644 --- a/src/utils/utils.cpp +++ b/src/utils/utils.cpp @@ -41,9 +41,11 @@ namespace duckdb } const char *ca_info = std::getenv ("CURL_CA_INFO"); + bool ca_info_was_env = true; // Flag to track if ca_info came from env #if !defined(_WIN32) && !defined(__APPLE__) if (!ca_info) { + ca_info_was_env = false; // Check for common CA certificate bundle locations on Linux for (const auto *path : { "/etc/ssl/certs/ca-certificates.crt", // Debian/Ubuntu/Gentoo etc. @@ -57,6 +59,7 @@ namespace duckdb if (file_exists (path)) { ca_info = path; + LogMessage(LogLevel::DEBUG, "Auto-detected CA certificate bundle: " + std::string(ca_info)); break; } } @@ -66,9 +69,12 @@ namespace duckdb curl_easy_setopt (curl, CURLOPT_WRITEFUNCTION, WriteCallback); if (ca_info) { + if (ca_info_was_env) { + LogMessage(LogLevel::DEBUG, "Using CA certificate bundle from CURL_CA_INFO: " + std::string(ca_info)); + } + // LogMessage for auto-detected is now inside the loop above. // Set the custom CA certificate bundle file // https://github.com/hatamiarash7/duckdb-netquack/issues/6 - LogMessage (LogLevel::DEBUG, "Using custom CA certificate bundle: " + std::string (ca_info)); curl_easy_setopt (curl, CURLOPT_CAINFO, ca_info); } const char *ca_path = std::getenv ("CURL_CA_PATH"); @@ -153,23 +159,38 @@ namespace duckdb std::istringstream stream (list_data); std::string line; con.Query ("CREATE OR REPLACE TABLE public_suffix_list (suffix VARCHAR)"); + Appender appender(con, "public_suffix_list"); // Get an appender for the table while (std::getline (stream, line)) { // Skip comments and empty lines - if (line.empty () || line[0] == '/' || line[0] == ' ') + if (line.empty () || line[0] == '/' || line[0] == ' ') { continue; + } // Replace `*.` with an empty string - size_t wildcard_pos = line.find ("*."); - if (wildcard_pos != std::string::npos) - { - line.replace (wildcard_pos, 2, ""); + // The current understanding is that if "*.foo.bar" is a rule, "foo.bar" is the suffix. + size_t wildcard_pos = line.find("*."); + if (wildcard_pos != std::string::npos) { + line.replace(wildcard_pos, 2, ""); + } + + // Remove leading/trailing whitespace that might remain or be part of original list + // Using isspace from might be more robust for various whitespace chars + // For now, using the provided list of whitespace characters. + line.erase(0, line.find_first_not_of(" \t\n\r\f\v")); + line.erase(line.find_last_not_of(" \t\n\r\f\v") + 1); + + // Skip if line becomes empty after processing + if (line.empty()) { + continue; } - // Insert the suffix into the table - con.Query ("INSERT INTO public_suffix_list (suffix) VALUES ('" + line + "')"); + appender.BeginRow(); + appender.Append(line); // Append the processed suffix + appender.EndRow(); } + appender.Close(); // Finalize and flush the appender } }