From 8d9855f7518bcfc9fa58f1b96cd4f4af46beb735 Mon Sep 17 00:00:00 2001
From: Chunting Gu <sprinfall@gmail.com>
Date: Thu, 31 Jan 2019 18:10:12 +0800
Subject: [PATCH] Support chunked response content (but no Trailer headers).

---
 CMakeLists.txt                                |   3 +-
 example/github_rest_client/CMakeLists.txt     |  15 +
 .../main.cc                                   |   0
 example/http_ssl_client/main.cc               |  38 ++-
 example/rest_github_client/CMakeLists.txt     |  10 -
 webcc/globals.cc                              |   1 +
 webcc/globals.h                               |   1 +
 webcc/http_parser.cc                          | 276 ++++++++++++++----
 webcc/http_parser.h                           |  21 +-
 webcc/http_ssl_client.cc                      |  13 +-
 webcc/http_ssl_client.h                       |   7 +-
 11 files changed, 289 insertions(+), 96 deletions(-)
 create mode 100644 example/github_rest_client/CMakeLists.txt
 rename example/{rest_github_client => github_rest_client}/main.cc (100%)
 delete mode 100644 example/rest_github_client/CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
index abca1b2..0e01453 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -90,6 +90,7 @@ if(WEBCC_ENABLE_SSL)
     find_package(OpenSSL)
     if(OPENSSL_FOUND)
         include_directories(${OPENSSL_INCLUDE_DIR})
+        message(STATUS "OpenSSL libs: " ${OPENSSL_LIBRARIES})
     endif()
 endif()
 
@@ -142,7 +143,7 @@ if(WEBCC_ENABLE_EXAMPLES)
 
     if(WEBCC_ENABLE_SSL)
         add_subdirectory(example/http_ssl_client)
-        add_subdirectory(example/rest_github_client)
+        add_subdirectory(example/github_rest_client)
     endif()
 
     add_subdirectory(example/http_bin_client)
diff --git a/example/github_rest_client/CMakeLists.txt b/example/github_rest_client/CMakeLists.txt
new file mode 100644
index 0000000..de6477d
--- /dev/null
+++ b/example/github_rest_client/CMakeLists.txt
@@ -0,0 +1,15 @@
+set(LIBS webcc jsoncpp ${Boost_LIBRARIES} "${CMAKE_THREAD_LIBS_INIT}")
+
+set(LIBS ${LIBS} ${OPENSSL_LIBRARIES})
+if(WIN32)
+	set(LIBS ${LIBS} crypt32)  # NOTE(review): presumably required by OpenSSL on Windows; confirm.
+endif()
+
+if(UNIX)
+	# Link the dl library (`-ldl` on Linux) to avoid "undefined reference to `dlopen'".
+	set(LIBS ${LIBS} ${CMAKE_DL_LIBS})
+endif()
+
+add_executable(github_rest_client main.cc)
+
+target_link_libraries(github_rest_client ${LIBS})
diff --git a/example/rest_github_client/main.cc b/example/github_rest_client/main.cc
similarity index 100%
rename from example/rest_github_client/main.cc
rename to example/github_rest_client/main.cc
diff --git a/example/http_ssl_client/main.cc b/example/http_ssl_client/main.cc
index e662bb2..ed204a2 100644
--- a/example/http_ssl_client/main.cc
+++ b/example/http_ssl_client/main.cc
@@ -3,19 +3,37 @@
 #include "webcc/http_ssl_client.h"
 #include "webcc/logger.h"
 
-void Test() {
-  webcc::HttpRequest request;
-  request.set_method(webcc::kHttpGet);
-  request.set_url("/LICENSE_1_0.txt");
+int main(int argc, char* argv[]) {
+  std::string host;
+  std::string url;
 
-  // Leave port to default value.
-  request.set_host("www.boost.org");
+  if (argc != 3) {
+    host = "www.boost.org";
+    url = "/LICENSE_1_0.txt";
+  } else {
+    host = argv[1];
+    url = argv[2];
+  }
+
+  std::cout << "Host: " << host << std::endl;
+  std::cout << "URL:  " << url << std::endl;
+  std::cout << std::endl;
+
+  WEBCC_LOG_INIT("", webcc::LOG_CONSOLE);
 
+  webcc::HttpRequest request;
+  request.set_method(webcc::kHttpGet);
+  request.set_url(url);
+  request.set_host(host);  // Leave port to default value.
   request.Make();
 
   webcc::HttpSslClient client;
 
-  if (client.Request(request)) {
+  // Verify the certificate of the peer or not.
+  // See HttpSslClient::Request() for more details.
+  bool ssl_verify = false;
+
+  if (client.Request(request, ssl_verify)) {
     std::cout << client.response()->content() << std::endl;
   } else {
     std::cout << webcc::DescribeError(client.error());
@@ -24,12 +42,6 @@ void Test() {
     }
     std::cout << std::endl;
   }
-}
-
-int main() {
-  WEBCC_LOG_INIT("", webcc::LOG_CONSOLE);
-
-  Test();
 
   return 0;
 }
diff --git a/example/rest_github_client/CMakeLists.txt b/example/rest_github_client/CMakeLists.txt
deleted file mode 100644
index 3c9917d..0000000
--- a/example/rest_github_client/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-add_executable(rest_github_client main.cc)
-
-set(SSL_LIBS ${OPENSSL_LIBRARIES})
-if(WIN32)
-	set(SSL_LIBS ${SSL_LIBS} crypt32)
-endif()
-
-target_link_libraries(rest_github_client webcc jsoncpp ${Boost_LIBRARIES})
-target_link_libraries(rest_github_client "${CMAKE_THREAD_LIBS_INIT}")
-target_link_libraries(rest_github_client ${SSL_LIBS})
diff --git a/webcc/globals.cc b/webcc/globals.cc
index 170ce2b..60e85c1 100644
--- a/webcc/globals.cc
+++ b/webcc/globals.cc
@@ -9,6 +9,7 @@ namespace webcc {
 const std::string kHost = "Host";
 const std::string kContentType = "Content-Type";
 const std::string kContentLength = "Content-Length";
+const std::string kTransferEncoding = "Transfer-Encoding";
 const std::string kUserAgent = "User-Agent";
 
 const std::string kAppJsonUtf8 = "application/json; charset=utf-8";
diff --git a/webcc/globals.h b/webcc/globals.h
index aeeff7b..9aee7b5 100644
--- a/webcc/globals.h
+++ b/webcc/globals.h
@@ -49,6 +49,7 @@ const std::size_t kMaxDumpSize = 2048;
 extern const std::string kHost;
 extern const std::string kContentType;
 extern const std::string kContentLength;
+extern const std::string kTransferEncoding;
 extern const std::string kUserAgent;
 
 extern const std::string kAppJsonUtf8;
diff --git a/webcc/http_parser.cc b/webcc/http_parser.cc
index 2afa1e0..d6ec7d1 100644
--- a/webcc/http_parser.cc
+++ b/webcc/http_parser.cc
@@ -7,47 +7,78 @@
 
 namespace webcc {
 
+// -----------------------------------------------------------------------------
+
+// Parse the leading number of |str| in |base|; false if there is none.
+static bool StringToSizeT(const std::string& str, int base,
+                          std::size_t* output) {
+  if (!str.empty() && str.front() == '-') return false;  // stoul would wrap.
+  try {
+    *output = static_cast<std::size_t>(std::stoul(str, nullptr, base));
+    return true;
+  } catch (const std::exception&) { return false; }
+}
+
+// -----------------------------------------------------------------------------
+
 HttpParser::HttpParser(HttpMessage* message)
     : message_(message),
       content_length_(kInvalidLength),
       start_line_parsed_(false),
       content_length_parsed_(false),
-      header_parsed_(false),
+      header_ended_(false),  // Set once the empty line after headers is seen.
+      chunked_(false),  // Set when "Transfer-Encoding: chunked" is parsed.
+      chunk_size_(kInvalidLength),  // kInvalidLength: no chunk-size read yet.
       finished_(false) {
 }
 
 bool HttpParser::Parse(const char* data, std::size_t length) {
-  if (header_parsed_) {
-    // Append the data to the content.
-    AppendContent(data, length);
+  // Buffer the new data; headers and content are both parsed from the buffer.
+  pending_data_.append(data, length);
 
-    if (IsContentFull()) {
-      // All content has been read.
-      Finish();
+  if (!header_ended_) {
+    // The headers have not ended yet, continue to parse them.
+    if (!ParseHeaders()) {
+      return false;
     }
 
+    if (header_ended_) {
+      LOG_INFO("HTTP headers just ended.");
+    }
+  }
+
+  // If the headers still have not ended, return and wait for the next read.
+  if (!header_ended_) {
+    LOG_INFO("HTTP headers will continue in next read.");
     return true;
   }
 
-  // Continue to parse headers.
-   
-  pending_data_.append(data, length);
+  // Now, parse the content (ParseHeaders() erased the parsed header data).
+
+  if (chunked_) {
+    return ParseChunkedContent();
+  } else {
+    return ParseFixedContent();
+  }
+}
+
+bool HttpParser::ParseHeaders() {
   std::size_t off = 0;
 
   while (true) {
-    std::size_t pos = pending_data_.find(CRLF, off);
-    if (pos == std::string::npos) {
+    std::string line;
+    if (!NextPendingLine(off, &line, false)) {
+      // Can't find a full header line, need more data from next read.
       break;
     }
 
-    if (pos == off) {   // End of headers.
-      off = pos + 2;    // Skip CRLF.
-      header_parsed_ = true;
+    off = off + line.size() + 2;  // +2 for CRLF
+
+    if (line.empty()) {
+      header_ended_ = true;
       break;
     }
 
-    std::string line = pending_data_.substr(off, pos - off);
-
     if (!start_line_parsed_) {
       start_line_parsed_ = true;
       message_->set_start_line(line + CRLF);
@@ -55,84 +86,201 @@ bool HttpParser::Parse(const char* data, std::size_t length) {
         return false;
       }
     } else {
-      ParseHeader(line);
+      ParseHeaderLine(line);
     }
+  }
+
+  // Remove the parsed data.
+  pending_data_.erase(0, off);
+
+  return true;
+}
 
-    off = pos + 2;  // Skip CRLF.
+bool HttpParser::NextPendingLine(std::size_t off, std::string* line,
+                                 bool remove) {
+  std::size_t pos = pending_data_.find(CRLF, off);
+
+  if (pos == std::string::npos) {
+    return false;  // No complete line yet; |line| is left untouched.
   }
 
-  if (header_parsed_) {
-    // Headers just ended.
-    LOG_INFO("HTTP headers parsed.");
+  std::size_t count = pos - off;  // Line length, excluding CRLF.
 
-    if (!content_length_parsed_) {
-      // No Content-Length, no content.
-      Finish();
-      return true;
-    } else {
-      // Invalid Content-Length in the request.
-      if (content_length_ == kInvalidLength) {
-        return false;
-      }
-    }
+  // Always assign (substr() with a zero count gives an empty string), so an
+  // empty line clears any stale content of |line| instead of keeping it.
+  *line = pending_data_.substr(off, count);
 
-    AppendContent(pending_data_.substr(off));
+  if (remove) {
+    pending_data_.erase(off, count + 2);  // +2 for CRLF
+  }
 
-    if (IsContentFull()) {
-      // All content has been read.
-      Finish();
-    }
-  } else {
-    // Save the unparsed piece for next parsing.
-    pending_data_ = pending_data_.substr(off);
+  return true;
+}
+
+bool HttpParser::ParseHeaderLine(const std::string& line) {
+  // Parse "name: value"; split at the first ':' since the value may have ':'.
+  std::size_t pos = line.find(':');
+  if (pos == std::string::npos) {
+    return false;
   }
 
+  std::string name = line.substr(0, pos);
+  boost::trim(name);
+
+  std::string value = line.substr(pos + 1);
+  boost::trim(value);
+
+  do {
+    if (!chunked_ && !content_length_parsed_) {
+      if (boost::iequals(name, kContentLength)) {
+        content_length_parsed_ = true;
+
+        if (!StringToSizeT(value, 10, &content_length_)) {
+          LOG_ERRO("Invalid content length: %s.", value.c_str());
+          return false;
+        }
+
+        LOG_INFO("Content length: %u.", content_length_);
+
+        try {
+          // Reserve memory to avoid frequent reallocation when append.
+          content_.reserve(content_length_);
+        } catch (const std::exception& e) {
+          LOG_ERRO("Failed to reserve content memory: %s.", e.what());
+          return false;
+        }
+
+        break;
+      }
+    }
+
+    // Transfer-Encoding overrides Content-Length (RFC 7230, 3.3.3).
+    // TODO: Replace `!chunked_` with <TransferEncodingParsed>.
+    if (!chunked_) {
+      if (boost::iequals(name, kTransferEncoding)) {
+        if (boost::iequals(value, "chunked")) {
+          // The content is chunked (coding names are case-insensitive).
+          chunked_ = true;
+        }
+        break;
+      }
+    }
+  } while (false);
+
+  // Save the header to the result message.
+  message_->SetHeader(std::move(name), std::move(value));
+
   return true;
 }
 
-bool HttpParser::ParseHeader(const std::string& line) {
-  std::vector<std::string> parts;
-  boost::split(parts, line, boost::is_any_of(":"));
+bool HttpParser::ParseFixedContent() {
+  if (!content_length_parsed_) {
+    // No Content-Length header, so no content is expected.
+    Finish();
+    return true;
+  }
 
-  if (parts.size() != 2) {
+  if (content_length_ == kInvalidLength) {
+    // The Content-Length value couldn't be parsed (syntax error). This is
+    // reachable since ParseHeaders() ignores ParseHeaderLine()'s result.
     return false;
   }
 
-  std::string& name = parts[0];
-  std::string& value = parts[1];
+  // TODO: Avoid copy using std::move.
+  AppendContent(pending_data_);
 
-  boost::trim(name);
-  boost::trim(value);
+  pending_data_.clear();
 
-  if (!content_length_parsed_ && boost::iequals(name, kContentLength)) {
-    content_length_parsed_ = true;
+  if (IsContentFull()) {
+    // All content has been read.
+    Finish();
+  }
 
-    try {
-      content_length_ = static_cast<std::size_t>(std::stoul(value));
-    } catch (const std::exception&) {
-      LOG_ERRO("Invalid content length: %s.", value.c_str());
-      return false;
+  return true;
+}
+
+bool HttpParser::ParseChunkedContent() {
+  LOG_VERB("Parse chunked content (pending data size: %u).",
+           pending_data_.size());
+
+  while (true) {
+    // Read chunk-size if necessary.
+    if (chunk_size_ == kInvalidLength) {
+      if (!ParseChunkSize()) {
+        return false;
+      }
+      if (chunk_size_ == kInvalidLength) {
+        // No complete chunk-size line yet: wait for more data from next read.
+        break;
+      }
+      LOG_VERB("Chunk size: %u.", chunk_size_);
+    }
+
+    if (chunk_size_ == 0) {
+      Finish();
+      return true;
     }
+
+    // Avoid `chunk_size_ + 2` here: it would overflow for huge chunk sizes.
+    if (pending_data_.size() >= 2 &&
+        chunk_size_ <= pending_data_.size() - 2) {  // -2 for CRLF
+      AppendContent(pending_data_.c_str(), chunk_size_);
 
-    LOG_INFO("Content length: %u.", content_length_);
+      pending_data_.erase(0, chunk_size_ + 2);
 
-    try {
-      // Reserve memory to avoid frequent reallocation when append.
-      content_.reserve(content_length_);
-    } catch (const std::exception& e) {
-      LOG_ERRO("Failed to reserve content memory: %s.", e.what());
-      return false;
+      // Reset chunk-size (NOT to 0).
+      chunk_size_ = kInvalidLength;
+
+      // Continue (explicitly) to parse next chunk.
+      continue;
+
+    } else if (chunk_size_ > pending_data_.size()) {
+      AppendContent(pending_data_);
+      chunk_size_ -= pending_data_.size();
+      pending_data_.clear();
+
+      // Wait for more data from next read.
+      break;
+
+    } else {
+      // The chunk data is complete but its CRLF is not: wait for next read.
+      break;
     }
   }
 
-  message_->SetHeader(std::move(name), std::move(value));
+  return true;
+}
+
+bool HttpParser::ParseChunkSize() {
+  LOG_VERB("Parse chunk size.");
+  // Take one line off the pending data and parse its hex chunk-size.
+  std::size_t off = 0;
+  std::string line;
+  if (!NextPendingLine(off, &line, true)) {
+    return true;  // Not an error: no full line yet, |chunk_size_| unchanged.
+  }
+
+  LOG_VERB("Chunk size line: [%s].", line.c_str());
+
+  std::string hex_str;  // e.g., "cf0" (3312)
+
+  std::size_t pos = line.find(' ');  // Drop anything after the first space.
+  if (pos != std::string::npos) {
+    hex_str = line.substr(0, pos);
+  } else {
+    hex_str = line;
+  }
+
+  if (!StringToSizeT(hex_str, 16, &chunk_size_)) {
+    LOG_ERRO("Invalid chunk-size: %s.", hex_str.c_str());
+    return false;
+  }
 
   return true;
 }
 
 void HttpParser::Finish() {
   if (!content_.empty()) {
-    // Move content to message.
     message_->SetContent(std::move(content_), /*set_length*/false);
   }
   finished_ = true;
diff --git a/webcc/http_parser.h b/webcc/http_parser.h
index e531dbf..bdd1524 100644
--- a/webcc/http_parser.h
+++ b/webcc/http_parser.h
@@ -26,9 +26,24 @@ class HttpParser {
   bool Parse(const char* data, std::size_t length);
 
  protected:
+  // Parse headers from pending data.
+  // Return false only on syntax errors.
+  bool ParseHeaders();
+
+  // Get next line (using delimiter CRLF) from the pending data.
+  // The line will not contain a trailing CRLF.
+  // If |remove| is true, the line, as well as the trailing CRLF, will be erased
+  // from the pending data.
+  bool NextPendingLine(std::size_t off, std::string* line, bool remove);
+
   virtual bool ParseStartLine(const std::string& line) = 0;
 
-  bool ParseHeader(const std::string& line);
+  bool ParseHeaderLine(const std::string& line);
+
+  bool ParseFixedContent();
+
+  bool ParseChunkedContent();
+  bool ParseChunkSize();
 
   void Finish();
 
@@ -48,7 +63,9 @@ class HttpParser {
   std::string content_;
   bool start_line_parsed_;
   bool content_length_parsed_;
-  bool header_parsed_;
+  bool header_ended_;
+  bool chunked_;
+  std::size_t chunk_size_;
   bool finished_;
 };
 
diff --git a/webcc/http_ssl_client.cc b/webcc/http_ssl_client.cc
index 49df839..8ae7ef9 100644
--- a/webcc/http_ssl_client.cc
+++ b/webcc/http_ssl_client.cc
@@ -34,7 +34,7 @@ void HttpSslClient::SetTimeout(int seconds) {
   }
 }
 
-bool HttpSslClient::Request(const HttpRequest& request) {
+bool HttpSslClient::Request(const HttpRequest& request, bool ssl_verify) {
   io_context_.restart();
 
   response_.reset(new HttpResponse());
@@ -48,7 +48,7 @@ bool HttpSslClient::Request(const HttpRequest& request) {
     return false;
   }
 
-  if ((error_ = Handshake(request.host())) != kNoError) {
+  if ((error_ = Handshake(request.host(), ssl_verify)) != kNoError) {
     return false;
   }
 
@@ -95,8 +95,13 @@ Error HttpSslClient::Connect(const HttpRequest& request) {
 }
 
 // NOTE: Don't check timeout. It doesn't make much sense.
-Error HttpSslClient::Handshake(const std::string& host) {
-  ssl_socket_.set_verify_mode(ssl::verify_peer);
+Error HttpSslClient::Handshake(const std::string& host, bool ssl_verify) {
+  if (ssl_verify) {
+    ssl_socket_.set_verify_mode(ssl::verify_peer);
+  } else {
+    ssl_socket_.set_verify_mode(ssl::verify_none);
+  }
+
   ssl_socket_.set_verify_callback(ssl::rfc2818_verification(host));
 
   // Use sync API directly since we don't need timeout control.
diff --git a/webcc/http_ssl_client.h b/webcc/http_ssl_client.h
index 38dc8cf..a2827d4 100644
--- a/webcc/http_ssl_client.h
+++ b/webcc/http_ssl_client.h
@@ -34,7 +34,10 @@ class HttpSslClient {
   void SetTimeout(int seconds);
 
   // Connect to server, send request, wait until response is received.
-  bool Request(const HttpRequest& request);
+  // NOTE: SSL verification (ssl_verify=true) needs CA certificates to be found
+  // in the default verify paths of OpenSSL. On Windows, it means you need to
+  // set environment variable SSL_CERT_FILE properly.
+  bool Request(const HttpRequest& request, bool ssl_verify = true);
 
   HttpResponsePtr response() const { return response_; }
 
@@ -45,7 +48,7 @@ class HttpSslClient {
  private:
   Error Connect(const HttpRequest& request);
 
-  Error Handshake(const std::string& host);
+  Error Handshake(const std::string& host, bool ssl_verify);
 
   Error SendReqeust(const HttpRequest& request);