Introduced new chunked transfer encoding parser to remove chunk marke…

…rs in streaming clients.
cpp-netlib · deanberris · Mar 31, 2017 · Dec 14, 2016 · Dec 14, 2016 · Dec 15, 2016
commit 673b85301ed5a090c6374c29c5450de3432e0432
diff --git a/boost/network/protocol/http/client/async_impl.hpp b/boost/network/protocol/http/client/async_impl.hpp
@@ -39,14 +39,15 @@ struct async_client
 
   async_client(bool cache_resolved, bool follow_redirect,
                bool always_verify_peer, int timeout,
+               bool remove_chunk_markers,
                std::shared_ptr<boost::asio::io_service> service,
                optional<string_type> certificate_filename,
                optional<string_type> verify_path,
                optional<string_type> certificate_file,
                optional<string_type> private_key_file,
                optional<string_type> ciphers,
                optional<string_type> sni_hostname, long ssl_options)
-      : connection_base(cache_resolved, follow_redirect, timeout),
+      : connection_base(cache_resolved, follow_redirect, timeout, remove_chunk_markers),
         service_ptr(service.get() ? service
                                   : std::make_shared<boost::asio::io_service>()),
         service_(*service_ptr),

diff --git a/boost/network/protocol/http/client/connection/async_base.hpp b/boost/network/protocol/http/client/connection/async_base.hpp
@@ -43,7 +43,7 @@ struct async_connection_base {
   // tag.
   static connection_ptr new_connection(
       resolve_function resolve, resolver_type &resolver, bool follow_redirect,
-      bool always_verify_peer, bool https, int timeout,
+      bool always_verify_peer, bool https, int timeout, bool remove_chunk_markers,
       optional<string_type> certificate_filename = optional<string_type>(),
       optional<string_type> const &verify_path = optional<string_type>(),
       optional<string_type> certificate_file = optional<string_type>(),
@@ -59,7 +59,7 @@ struct async_connection_base {
         certificate_filename, verify_path, certificate_file, private_key_file,
         ciphers, sni_hostname, ssl_options);
     auto temp = std::make_shared<async_connection>(
-        resolver, resolve, follow_redirect, timeout, std::move(delegate));
+        resolver, resolve, follow_redirect, timeout, remove_chunk_markers, std::move(delegate));
     BOOST_ASSERT(temp != nullptr);
     return temp;
   }

diff --git a/boost/network/protocol/http/client/connection/async_normal.hpp b/boost/network/protocol/http/client/connection/async_normal.hpp
@@ -11,6 +11,7 @@
 
 #include <iterator>
 #include <cstdint>
+#include <iostream>
 #include <boost/algorithm/string/trim.hpp>
 #include <boost/asio/steady_timer.hpp>
 #include <boost/asio/placeholders.hpp>
@@ -37,6 +38,77 @@ namespace network {
 namespace http {
 namespace impl {
 
+template <class Tag>
+struct chunk_encoding_parser {
+
+  chunk_encoding_parser() : state(state_t::header), chunk_size(0) {}
+
+  enum state_t { header, header_end, data, data_end };
+
+  state_t state;
+  size_t chunk_size;
+  std::array<typename char_<Tag>::type, 1024> buffer;
+
+  void update_chunk_size(boost::iterator_range<typename std::array<typename char_<Tag>::type, 1024>::const_iterator> const& range) {
+    if (range.empty())
+        return;
+    std::stringstream ss;
+    ss << std::hex << range;
+    size_t size;
+    ss >> size;
+    chunk_size = (chunk_size << (range.size()*4)) + size;
+  }
+
+  boost::iterator_range<typename std::array<typename char_<Tag>::type, 1024>::const_iterator> operator()(boost::iterator_range<typename std::array<typename char_<Tag>::type, 1024>::const_iterator> const& range) {
+    auto iter = boost::begin(range);
+    auto begin = iter;
+    auto pos = boost::begin(buffer);
+
+    while (iter != boost::end(range))
+      switch(state) {
+        case state_t::header:
+          iter = std::find(iter, boost::end(range), '\r');
+          update_chunk_size(boost::make_iterator_range(begin, iter));
+          if (iter != boost::end(range)) {
+            state = state_t::header_end;
+            ++iter;
+          }
+          break;
+
+        case state_t::header_end:
+          BOOST_ASSERT(*iter == '\n');
+          ++iter;
+          state = state_t::data;
+          break;
+
+        case state_t::data:
+          if (chunk_size == 0) {
+            BOOST_ASSERT(*iter == '\r');
+            ++iter;
+            state = state_t::data_end;
+          } else {
+            auto len = std::min(chunk_size, (size_t)std::distance(iter, boost::end(range)));
+            begin = iter;
+            iter = std::next(iter, len);
+            pos = std::copy(begin, iter, pos);
+            chunk_size -= len;
+          }
+          break;
+
+        case state_t::data_end:
+          BOOST_ASSERT (*iter == '\n');
+          ++iter;
+          begin = iter;
+          state = state_t::header;
+          break;
+
+        default:
+          BOOST_ASSERT(false && "Bug, report this to the developers!");
+      }
+    return boost::make_iterator_range(boost::begin(buffer), pos);
+  }
+};
+
 template <class Tag, unsigned version_major, unsigned version_minor>
 struct async_connection_base;
 
@@ -72,8 +144,10 @@ struct http_async_connection
 
   http_async_connection(resolver_type& resolver, resolve_function resolve,
                         bool follow_redirect, int timeout,
+                        bool remove_chunk_markers,
                         connection_delegate_ptr delegate)
       : timeout_(timeout),
+        remove_chunk_markers_(remove_chunk_markers),
         timer_(resolver.get_io_service()),
         is_timedout_(false),
         follow_redirect_(follow_redirect),
@@ -348,8 +422,11 @@ struct http_async_connection
 
             // The invocation of the callback is synchronous to allow us to
             // wait before scheduling another read.
-            callback(make_iterator_range(begin, end), ec);
-
+            if (this->is_chunk_encoding && remove_chunk_markers_) {
+              callback(parse_chunk_encoding(make_iterator_range(begin, end)), ec);
+            } else {
+              callback(make_iterator_range(begin, end), ec);
+            }
             auto self = this->shared_from_this();
             delegate_->read_some(
                 boost::asio::mutable_buffers_1(this->part.data(),
@@ -388,14 +465,28 @@ struct http_async_connection
               // We call the callback function synchronously passing the error
               // condition (in this case, end of file) so that it can handle it
               // appropriately.
-              callback(make_iterator_range(begin, end), ec);
+              if (this->is_chunk_encoding && remove_chunk_markers_) {
+                callback(parse_chunk_encoding(make_iterator_range(begin, end)), ec);
+              } else {
+                callback(make_iterator_range(begin, end), ec);
+              }
             } else {
               string_type body_string;
-              std::swap(body_string, this->partial_parsed);
-              body_string.append(this->part.begin(), this->part.begin() + bytes_transferred);
-              if (this->is_chunk_encoding) {
-                this->body_promise.set_value(parse_chunk_encoding(body_string));
+              if (this->is_chunk_encoding && remove_chunk_markers_) {
+                for (size_t i = 0; i < this->partial_parsed.size(); i += 1024) {
+                  auto range = parse_chunk_encoding(
+                    boost::make_iterator_range(this->partial_parsed.data() + i, 
+                                               this->partial_parsed.data() + std::min(i+1024, this->partial_parsed.size())));
+                  body_string.append(boost::begin(range), boost::end(range));
+                }
+                this->partial_parsed.clear();
+                auto range = parse_chunk_encoding(boost::make_iterator_range(this->part.begin(), 
+                                                                             this->part.begin() + bytes_transferred));
+                body_string.append(boost::begin(range), boost::end(range));
+                this->body_promise.set_value(body_string);
               } else {
+                std::swap(body_string, this->partial_parsed);
+                body_string.append(this->part.begin(), this->part.begin() + bytes_transferred);
                 this->body_promise.set_value(body_string);
               }
             }
@@ -417,7 +508,11 @@ struct http_async_connection
                   this->part.begin();
               typename protocol_base::buffer_type::const_iterator end = begin;
               std::advance(end, bytes_transferred);
-              callback(make_iterator_range(begin, end), ec);
+              if (this->is_chunk_encoding && remove_chunk_markers_) {
+                callback(parse_chunk_encoding(make_iterator_range(begin, end)), ec);
+              } else {
+                callback(make_iterator_range(begin, end), ec);
+              }
               auto self = this->shared_from_this();
               delegate_->read_some(
                   boost::asio::mutable_buffers_1(this->part.data(),
@@ -476,38 +571,39 @@ struct http_async_connection
     }
   }
 
-  string_type parse_chunk_encoding(string_type& body_string) {
-    string_type body;
-    string_type crlf = "\r\n";
-
-    typename string_type::iterator begin = body_string.begin();
-    for (typename string_type::iterator iter =
-             std::search(begin, body_string.end(), crlf.begin(), crlf.end());
-         iter != body_string.end();
-         iter =
-             std::search(begin, body_string.end(), crlf.begin(), crlf.end())) {
-      string_type line(begin, iter);
-      if (line.empty()) {
-        break;
-      }
-      std::stringstream stream(line);
-      int len;
-      stream >> std::hex >> len;
-      std::advance(iter, 2);
-      if (len == 0) {
-        break;
-      }
-      if (len <= body_string.end() - iter) {
-        body.insert(body.end(), iter, iter + len);
-        std::advance(iter, len + 2);
-      }
-      begin = iter;
-    }
+  // string_type parse_chunk_encoding(string_type& body_string) {
+  //   string_type body;
+  //   string_type crlf = "\r\n";
 
-    return body;
-  }
+  //   typename string_type::iterator begin = body_string.begin();
+  //   for (typename string_type::iterator iter =
+  //            std::search(begin, body_string.end(), crlf.begin(), crlf.end());
+  //        iter != body_string.end();
+  //        iter =
+  //            std::search(begin, body_string.end(), crlf.begin(), crlf.end())) {
+  //     string_type line(begin, iter);
+  //     if (line.empty()) {
+  //       break;
+  //     }
+  //     std::stringstream stream(line);
+  //     int len;
+  //     stream >> std::hex >> len;
+  //     std::advance(iter, 2);
+  //     if (len == 0) {
+  //       break;
+  //     }
+  //     if (len <= body_string.end() - iter) {
+  //       body.insert(body.end(), iter, iter + len);
+  //       std::advance(iter, len + 2);
+  //     }
+  //     begin = iter;
+  //   }
+
+  //   return body;
+  // }
 
   int timeout_;
+  bool remove_chunk_markers_;
   boost::asio::steady_timer timer_;
   bool is_timedout_;
   bool follow_redirect_;
@@ -517,6 +613,7 @@ struct http_async_connection
   connection_delegate_ptr delegate_;
   boost::asio::streambuf command_streambuf;
   string_type method;
+  chunk_encoding_parser<Tag> parse_chunk_encoding;
 };
 
 }  // namespace impl

diff --git a/boost/network/protocol/http/client/facade.hpp b/boost/network/protocol/http/client/facade.hpp
@@ -303,7 +303,8 @@ class basic_client_facade {
         options.openssl_verify_path(), options.openssl_certificate_file(),
         options.openssl_private_key_file(), options.openssl_ciphers(),
         options.openssl_sni_hostname(), options.openssl_options(),
-        options.io_service(), options.timeout()));
+        options.io_service(), options.timeout(),
+        options.remove_chunk_markers()));
   }
 };
 

diff --git a/boost/network/protocol/http/client/options.hpp b/boost/network/protocol/http/client/options.hpp
@@ -34,7 +34,8 @@ class client_options {
         openssl_options_(0),
         io_service_(),
         always_verify_peer_(true),
-        timeout_(0) {}
+        timeout_(0),
+        remove_chunk_markers_(false) {}
 
   client_options(client_options const& other)
       : cache_resolved_(other.cache_resolved_),
@@ -48,7 +49,8 @@ class client_options {
         openssl_options_(other.openssl_options_),
         io_service_(other.io_service_),
         always_verify_peer_(other.always_verify_peer_),
-        timeout_(other.timeout_) {}
+        timeout_(other.timeout_),
+        remove_chunk_markers_(other.remove_chunk_markers) {}
 
   client_options& operator=(client_options other) {
     other.swap(*this);
@@ -69,6 +71,7 @@ class client_options {
     swap(io_service_, other.io_service_);
     swap(always_verify_peer_, other.always_verify_peer_);
     swap(timeout_, other.timeout_);
+    swap(remove_chunk_markers_, other.remove_chunk_markers_);
   }
 
   /// Specify whether the client should cache resolved endpoints.
@@ -154,6 +157,12 @@ class client_options {
     return *this;
   }
 
+  /// Set an overall timeout for HTTP requests.
+  client_options& remove_chunk_markers(bool v) {
+    remove_chunk_markers_ = v;
+    return *this;
+  }
+
   bool cache_resolved() const { return cache_resolved_; }
 
   bool follow_redirects() const { return follow_redirects_; }
@@ -190,6 +199,8 @@ class client_options {
 
   int timeout() const { return timeout_; }
 
+  bool remove_chunk_markers() const { return remove_chunk_markers_; }
+
  private:
   bool cache_resolved_;
   bool follow_redirects_;
@@ -203,6 +214,7 @@ class client_options {
   std::shared_ptr<boost::asio::io_service> io_service_;
   bool always_verify_peer_;
   int timeout_;
+  bool remove_chunk_markers_;
 };
 
 template <class Tag>

diff --git a/boost/network/protocol/http/client/pimpl.hpp b/boost/network/protocol/http/client/pimpl.hpp
@@ -74,10 +74,12 @@ struct basic_client_impl
                     optional<string_type> const& private_key_file,
                     optional<string_type> const& ciphers,
                     optional<string_type> const& sni_hostname, long ssl_options,
-                    std::shared_ptr<boost::asio::io_service> service, int timeout)
-      : base_type(cache_resolved, follow_redirect, always_verify_peer, timeout,
-                  service, certificate_filename, verify_path, certificate_file,
-                  private_key_file, ciphers, sni_hostname, ssl_options) {}
+                    std::shared_ptr<boost::asio::io_service> service, int timeout,
+                    bool remove_chunk_markers)
+      : base_type(cache_resolved, follow_redirect, always_verify_peer, timeout, 
+                  remove_chunk_markers, service, certificate_filename, verify_path,
+                  certificate_file, private_key_file, ciphers, sni_hostname,
+                  ssl_options) {}
 
   ~basic_client_impl() = default;
 };