diff --git a/README.md b/README.md index 9202a46..1214e0b 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ AI scrapers are everywhere. This will stop them. `robots.txt` won't. - Support for IP-Range based rules (both ipv4 and ipv6) - Support for async (multithreaded) request handling - Minimal. The waiting page is tiny and light on network usage. +- Support for verbose traffic logging for later inspection or statistics ### Planned features - Dynamic challenge amount (aka difficulty) based on traffic diff --git a/example/config.jsonc b/example/config.jsonc index ee5d051..db206ab 100644 --- a/example/config.jsonc +++ b/example/config.jsonc @@ -69,5 +69,17 @@ "async_proxy": true, // If enabled, specific requests that look like git HTTP(s) clones will be let through. - "git_host": false + "git_host": false, + + // Traffic logging to a .csv file + "logging": { + "log_traffic": false, + + // This is a sample schema with all supported fields + // Please keep in mind your local legal regulations, as IPs under GDPR are considered personal data. + "traffic_log_schema": "epoch,ip,domain,resource,useragent,action", + + // Where to save the logfile. Each run will continue appending to this file. It may grow HUGE! No automatic pruning / compression is done. 
+ "traffic_log_file": "./traffic.csv" + } } \ No newline at end of file diff --git a/src/config/Config.hpp b/src/config/Config.hpp index bf36cae..41d7458 100644 --- a/src/config/Config.hpp +++ b/src/config/Config.hpp @@ -37,6 +37,12 @@ class CConfig { int default_challenge_difficulty = 4; bool async_proxy = true; std::vector proxy_rules; + + struct { + bool log_traffic = false; + std::string traffic_log_schema; + std::string traffic_log_file; + } logging; } m_config; struct { diff --git a/src/core/Handler.cpp b/src/core/Handler.cpp index 3f16d13..fc5fdf2 100644 --- a/src/core/Handler.cpp +++ b/src/core/Handler.cpp @@ -14,6 +14,8 @@ #include "../GlobalState.hpp" #include "../config/Config.hpp" #include "../helpers/FsUtils.hpp" +#include "../helpers/RequestUtils.hpp" +#include "../logging/TrafficLogger.hpp" #include #include @@ -56,73 +58,10 @@ static std::string generateToken() { return ss.str(); } -std::string CServerHandler::fingerprintForRequest(const Pistache::Http::Request& req) { - const auto HEADERS = req.headers(); - std::shared_ptr acceptEncodingHeader; - std::shared_ptr userAgentHeader; - std::shared_ptr languageHeader; - - std::string input = "checkpoint-"; - - try { - acceptEncodingHeader = Pistache::Http::Header::header_cast(HEADERS.get("Accept-Encoding")); - } catch (std::exception& e) { - ; // silent ignore - } - - try { - languageHeader = Pistache::Http::Header::header_cast(HEADERS.get("Accept-Language")); - } catch (std::exception& e) { - ; // silent ignore - } - - try { - userAgentHeader = Pistache::Http::Header::header_cast(HEADERS.get("User-Agent")); - } catch (std::exception& e) { - ; // silent ignore - } - - input += ipForRequest(req); - // TODO: those seem to change. Find better things to hash. 
- // if (acceptEncodingHeader) - // input += HEADERS.getRaw("Accept-Encoding").value(); - // if (languageHeader) - // input += languageHeader->language(); - if (userAgentHeader) - input += userAgentHeader->agent(); - - return g_pCrypto->sha256(input); -} - bool CServerHandler::isResourceCheckpoint(const std::string_view& res) { return res.starts_with("/checkpoint/"); } -std::string CServerHandler::ipForRequest(const Pistache::Http::Request& req) { - std::shared_ptr cfHeader; - std::shared_ptr xRealIPHeader; - - try { - cfHeader = Pistache::Http::Header::header_cast(req.headers().get("cf-connecting-ip")); - } catch (std::exception& e) { - ; // silent ignore - } - - try { - xRealIPHeader = Pistache::Http::Header::header_cast(req.headers().get("X-Real-IP")); - } catch (std::exception& e) { - ; // silent ignore - } - - if (cfHeader) - return cfHeader->ip(); - - if (xRealIPHeader) - return xRealIPHeader->ip(); - - return req.address().host(); -} - void CServerHandler::onRequest(const Pistache::Http::Request& req, Pistache::Http::ResponseWriter response) { const auto HEADERS = req.headers(); std::shared_ptr hostHeader; @@ -186,7 +125,7 @@ void CServerHandler::onRequest(const Pistache::Http::Request& req, Pistache::Htt Debug::log(LOG, "New request: {}:{}{}", hostHeader->host(), hostHeader->port().toString(), req.resource()); - const auto REQUEST_IP = ipForRequest(req); + const auto REQUEST_IP = NRequestUtils::ipForRequest(req); Debug::log(LOG, " | Request author: IP {}, direct: {}", REQUEST_IP, req.address().host()); @@ -228,12 +167,14 @@ void CServerHandler::onRequest(const Pistache::Http::Request& req, Pistache::Htt Debug::log(TRACE, "Request looks like it is coming from git (UA + GP). Accepting."); proxyPass(req, response); + g_pTrafficLogger->logTraffic(req, IP_ACTION_ALLOW); return; } else if (userAgentHeader->agent().starts_with("git/")) { Debug::log(LOG, " | Action: PASS (git)"); Debug::log(TRACE, "Request looks like it is coming from git (UA git). 
Accepting."); proxyPass(req, response); + g_pTrafficLogger->logTraffic(req, IP_ACTION_ALLOW); return; } } @@ -249,10 +190,12 @@ void CServerHandler::onRequest(const Pistache::Http::Request& req, Pistache::Htt case IP_ACTION_DENY: Debug::log(LOG, " | Action: DENY (rule)"); response.send(Pistache::Http::Code::Forbidden, "Blocked by checkpoint"); + g_pTrafficLogger->logTraffic(req, IP_ACTION_DENY); return; case IP_ACTION_ALLOW: Debug::log(LOG, " | Action: PASS (rule)"); proxyPass(req, response); + g_pTrafficLogger->logTraffic(req, IP_ACTION_ALLOW); return; case IP_ACTION_CHALLENGE: Debug::log(LOG, " | Action: CHALLENGE (rule)"); @@ -273,8 +216,9 @@ void CServerHandler::onRequest(const Pistache::Http::Request& req, Pistache::Htt if (TOKEN.valid()) { const auto AGE = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count() - std::chrono::duration_cast(TOKEN.issued().time_since_epoch()).count(); - if (AGE <= TOKEN_MAX_AGE_MS && TOKEN.fingerprint() == fingerprintForRequest(req)) { + if (AGE <= TOKEN_MAX_AGE_MS && TOKEN.fingerprint() == NRequestUtils::fingerprintForRequest(req)) { Debug::log(LOG, " | Action: PASS (token)"); + g_pTrafficLogger->logTraffic(req, IP_ACTION_ALLOW); proxyPass(req, response); return; } else { // token has been used from a different IP or is expired. Nuke it. 
@@ -329,6 +273,8 @@ void CServerHandler::onRequest(const Pistache::Http::Request& req, Pistache::Htt return; } + g_pTrafficLogger->logTraffic(req, IP_ACTION_CHALLENGE); + serveStop(req, response, challengeDifficulty); } @@ -338,7 +284,7 @@ void CServerHandler::onTimeout(const Pistache::Http::Request& request, Pistache: void CServerHandler::challengeSubmitted(const Pistache::Http::Request& req, Pistache::Http::ResponseWriter& response, bool js) { const auto JSON = req.body(); - const auto FINGERPRINT = fingerprintForRequest(req); + const auto FINGERPRINT = NRequestUtils::fingerprintForRequest(req); CChallenge CHALLENGE; if (!js) @@ -385,7 +331,7 @@ void CServerHandler::serveStop(const Pistache::Http::Request& req, Pistache::Htt page.setTemplateRoot(PAGE_ROOT); const auto NONCE = generateNonce(); - const auto CHALLENGE = CChallenge(fingerprintForRequest(req), NONCE, difficulty); + const auto CHALLENGE = CChallenge(NRequestUtils::fingerprintForRequest(req), NONCE, difficulty); auto hostDomain = req.headers().getRaw("Host").value(); if (hostDomain.contains(":")) diff --git a/src/core/Handler.hpp b/src/core/Handler.hpp index 420099e..8427871 100644 --- a/src/core/Handler.hpp +++ b/src/core/Handler.hpp @@ -22,8 +22,6 @@ class CServerHandler : public Pistache::Http::Handler { void proxyPassInternal(const Pistache::Http::Request& req, Pistache::Http::ResponseWriter& response, bool async = false); void proxyPassAsync(const Pistache::Http::Request& req, Pistache::Http::ResponseWriter& response); void challengeSubmitted(const Pistache::Http::Request& req, Pistache::Http::ResponseWriter& response, bool js); - std::string fingerprintForRequest(const Pistache::Http::Request& req); - std::string ipForRequest(const Pistache::Http::Request& req); bool isResourceCheckpoint(const std::string_view& res); diff --git a/src/helpers/RequestUtils.cpp b/src/helpers/RequestUtils.cpp new file mode 100644 index 0000000..df81af7 --- /dev/null +++ b/src/helpers/RequestUtils.cpp @@ -0,0 +1,75 @@ 
+#include "RequestUtils.hpp" + +#include "../core/Crypto.hpp" + +#include "../headers/authorization.hpp" +#include "../headers/cfHeader.hpp" +#include "../headers/xforwardfor.hpp" +#include "../headers/gitProtocolHeader.hpp" +#include "../headers/wwwAuthenticateHeader.hpp" +#include "../headers/acceptLanguageHeader.hpp" +#include "../headers/setCookieHeader.hpp" +#include "../headers/xrealip.hpp" + +std::string NRequestUtils::fingerprintForRequest(const Pistache::Http::Request& req) { + const auto HEADERS = req.headers(); + std::shared_ptr acceptEncodingHeader; + std::shared_ptr userAgentHeader; + std::shared_ptr languageHeader; + + std::string input = "checkpoint-"; + + try { + acceptEncodingHeader = Pistache::Http::Header::header_cast(HEADERS.get("Accept-Encoding")); + } catch (std::exception& e) { + ; // silent ignore + } + + try { + languageHeader = Pistache::Http::Header::header_cast(HEADERS.get("Accept-Language")); + } catch (std::exception& e) { + ; // silent ignore + } + + try { + userAgentHeader = Pistache::Http::Header::header_cast(HEADERS.get("User-Agent")); + } catch (std::exception& e) { + ; // silent ignore + } + + input += ipForRequest(req); + // TODO: those seem to change. Find better things to hash. 
+ // if (acceptEncodingHeader) + // input += HEADERS.getRaw("Accept-Encoding").value(); + // if (languageHeader) + // input += languageHeader->language(); + if (userAgentHeader) + input += userAgentHeader->agent(); + + return g_pCrypto->sha256(input); +} + +std::string NRequestUtils::ipForRequest(const Pistache::Http::Request& req) { + std::shared_ptr cfHeader; + std::shared_ptr xRealIPHeader; + + try { + cfHeader = Pistache::Http::Header::header_cast(req.headers().get("cf-connecting-ip")); + } catch (std::exception& e) { + ; // silent ignore + } + + try { + xRealIPHeader = Pistache::Http::Header::header_cast(req.headers().get("X-Real-IP")); + } catch (std::exception& e) { + ; // silent ignore + } + + if (cfHeader) + return cfHeader->ip(); + + if (xRealIPHeader) + return xRealIPHeader->ip(); + + return req.address().host(); +} \ No newline at end of file diff --git a/src/helpers/RequestUtils.hpp b/src/helpers/RequestUtils.hpp new file mode 100644 index 0000000..3c52a52 --- /dev/null +++ b/src/helpers/RequestUtils.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include + +#include + +namespace NRequestUtils { + std::string fingerprintForRequest(const Pistache::Http::Request& req); + std::string ipForRequest(const Pistache::Http::Request& req); +}; \ No newline at end of file diff --git a/src/logging/TrafficLogger.cpp b/src/logging/TrafficLogger.cpp new file mode 100644 index 0000000..a7cb310 --- /dev/null +++ b/src/logging/TrafficLogger.cpp @@ -0,0 +1,143 @@ +#include "TrafficLogger.hpp" + +#include +#include + +#include "../config/Config.hpp" +#include "../debug/log.hpp" +#include "../helpers/RequestUtils.hpp" + +CTrafficLogger::CTrafficLogger() { + if (!g_pConfig->m_config.logging.log_traffic) + return; + + const auto COMMAS = std::count(g_pConfig->m_config.logging.traffic_log_schema.begin(), g_pConfig->m_config.logging.traffic_log_schema.end(), ','); + + // parse the schema + std::string_view curr; + size_t lastPos = 0; + bool first = true; + auto advance = [&]() { + 
// Escapes a value so it can be placed between double quotes in a CSV record.
//
// CSV (RFC 4180) escapes a double quote inside a quoted field by doubling it
// (" becomes ""). A backslash is not an escape character in CSV, so writing \"
// would end the quoted field early in standard readers and let request data
// (user agent, resource, ...) spill into neighbouring columns.
//
// @param s raw field value, possibly taken from untrusted request data
// @return copy of s with every '"' doubled; the caller adds the surrounding quotes
static std::string sanitize(const std::string& s) {
    std::string escaped;
    escaped.reserve(s.size() + 2);

    for (const char c : s) {
        if (c == '"')
            escaped += '"'; // write the quote twice
        escaped += c;
    }

    return escaped;
}
std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count()); + break; + } + + case TRAFFIC_DOMAIN: { + const auto HOST = Pistache::Http::Header::header_cast(req.headers().get("Host")); + ss << fmt::format("\"{}\",", sanitize(HOST->host())); + break; + } + + case TRAFFIC_IP: { + ss << fmt::format("{},", NRequestUtils::ipForRequest(req)); + break; + } + + case TRAFFIC_RESOURCE: { + ss << fmt::format("\"{}\",", sanitize(req.resource())); + break; + } + + case TRAFFIC_USERAGENT: { + if (!req.headers().has("User-Agent")) { + ss << "\"\","; + break; + } + const auto UA = Pistache::Http::Header::header_cast(req.headers().get("User-Agent")); + ss << fmt::format("\"{}\",", sanitize(UA->agent())); + break; + } + + case TRAFFIC_ACTION: { + ss << fmt::format("{},", actionToString(actionTaken)); + break; + } + } + } + + std::string trafficLine = ss.str(); + if (trafficLine.empty()) + return; + + // replace , with \n + trafficLine.back() = '\n'; + + m_file << trafficLine; + m_file.flush(); +} \ No newline at end of file diff --git a/src/logging/TrafficLogger.hpp b/src/logging/TrafficLogger.hpp new file mode 100644 index 0000000..bd47546 --- /dev/null +++ b/src/logging/TrafficLogger.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include +#include +#include +#include + +#include "../config/ConfigTypes.hpp" + +#include + +class CTrafficLogger { + public: + CTrafficLogger(); + ~CTrafficLogger(); + + void logTraffic(const Pistache::Http::Request& req, eConfigIPAction actionTaken); + + private: + enum eTrafficLoggerProps : uint8_t { + TRAFFIC_EPOCH = 0, + TRAFFIC_IP, + TRAFFIC_DOMAIN, + TRAFFIC_RESOURCE, + TRAFFIC_USERAGENT, + TRAFFIC_ACTION, + }; + + std::vector m_logSchema; + std::ofstream m_file; +}; + +inline std::unique_ptr g_pTrafficLogger; \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index b71703d..9cfaf95 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -25,6 +25,8 @@ #include "config/Config.hpp" +#include "logging/TrafficLogger.hpp" 
+ #include "GlobalState.hpp" #include @@ -87,7 +89,8 @@ int main(int argc, char** argv, char** envp) { Pistache::Http::Header::Registry::instance().registerHeader(); Pistache::Http::Header::Registry::instance().registerHeader(); - g_pCrypto = std::make_unique(); + g_pCrypto = std::make_unique(); + g_pTrafficLogger = std::make_unique(); auto endpoint = std::make_unique(address); auto opts = Pistache::Http::Endpoint::options().threads(threads).flags(Pistache::Tcp::Options::ReuseAddr | Pistache::Tcp::Options::ReusePort);