diff --git a/README.md b/README.md index ba26b47c..f50b5e0b 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,20 @@ TABLES GoodBye! ``` + +# Compressed Data + +Consider using the new "extra_headers" optional parameter to send +"Accept-Encoding: gzip" and have Druid return the results compressed, +increasing the performance of the query especially for large data sets. + +```python +from pydruid.client import PyDruid + +query = PyDruid(druid_url_goes_here, 'druid/v2', extra_headers={"Accept-Encoding": "gzip"}) +``` + + # Contributing Contributions are welcomed of course. We like to use `black` and `flake8`. diff --git a/pydruid/client.py b/pydruid/client.py index 95dca1dd..16e41eb6 100755 --- a/pydruid/client.py +++ b/pydruid/client.py @@ -16,6 +16,7 @@ from __future__ import division from __future__ import absolute_import +import sys import json import re @@ -24,19 +25,35 @@ from pydruid.query import QueryBuilder from base64 import b64encode +if sys.version_info.major == 2 and sys.version_info.minor == 7: + import StringIO + from gzip import GzipFile + + def decompress(data): + infile = StringIO.StringIO() + infile.write(data) + with GzipFile(fileobj=infile, mode="r") as f: + f.rewind() + ud = f.read() + return ud + + +else: + from gzip import decompress # extract error from the
tag inside the HTML response
HTML_ERROR = re.compile("\\s*(.*?)\\s*
", re.IGNORECASE)
class BaseDruidClient(object):
- def __init__(self, url, endpoint):
+ def __init__(self, url, endpoint, extra_headers=None):
self.url = url
self.endpoint = endpoint
self.query_builder = QueryBuilder()
self.username = None
self.password = None
self.proxies = None
+ self.extra_headers = extra_headers
def set_basic_auth_credentials(self, username, password):
self.username = username
@@ -55,6 +72,8 @@ def _prepare_url_headers_and_body(self, query):
else:
url = self.url + "/" + self.endpoint
headers = {"Content-Type": "application/json"}
+ if self.extra_headers and isinstance(self.extra_headers, dict):
+ headers.update(self.extra_headers)
if (self.username is not None) and (self.password is not None):
authstring = "{}:{}".format(self.username, self.password)
b64string = b64encode(authstring.encode()).decode()
@@ -542,15 +561,23 @@ class PyDruid(BaseDruidClient):
1 6 2013-10-04T00:00:00.000Z user_2
"""
- def __init__(self, url, endpoint):
- super(PyDruid, self).__init__(url, endpoint)
+ def __init__(self, url, endpoint, extra_headers=None):
+ super(PyDruid, self).__init__(url, endpoint, extra_headers)
def _post(self, query):
try:
headers, querystr, url = self._prepare_url_headers_and_body(query)
req = urllib.request.Request(url, querystr, headers)
res = urllib.request.urlopen(req)
- data = res.read().decode("utf-8")
+ content_encoding = res.info().get("Content-Encoding")
+ if content_encoding == "gzip":
+ data = decompress(res.read()).decode("utf-8")
+ elif content_encoding:
+ raise ValueError(
+ "Invalid content encoding: {}".format(content_encoding)
+ )
+ else:
+ data = res.read().decode("utf-8")
res.close()
except urllib.error.HTTPError as e:
err = e.read()
diff --git a/tests/test_client.py b/tests/test_client.py
index 8f2d0a1b..161eeebc 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -1,6 +1,6 @@
# -*- coding: UTF-8 -*-
import textwrap
-
+import sys
import pytest
from mock import patch, Mock
from six.moves import urllib
@@ -11,29 +11,38 @@
from pydruid.utils.aggregators import doublesum
from pydruid.utils.filters import Dimension
+if sys.version_info.major == 2 and sys.version_info.minor == 7:
+ from gzip import GzipFile
+
+ def compress(data):
+ out = StringIO()
+ with GzipFile(fileobj=out, mode="w") as f:
+ f.write(data)
+ return out.getvalue()
+
+
+else:
+ from gzip import compress
-def create_client():
- return PyDruid("http://localhost:8083", "druid/v2/")
+
+def create_client(headers=None):
+ return PyDruid("http://localhost:8083", "druid/v2/", headers)
def create_blank_query():
- return Query({}, 'none')
+ return Query({}, "none")
-def _http_error(code, msg, data = ''):
+def _http_error(code, msg, data=""):
# Need a file-like object for the response data
fp = StringIO(data)
return urllib.error.HTTPError(
- url='http://fakeurl:8080/druid/v2/',
- hdrs={},
- code=code,
- msg=msg,
- fp=fp,
+ url="http://fakeurl:8080/druid/v2/", hdrs={}, code=code, msg=msg, fp=fp
)
class TestPyDruid:
- @patch('pydruid.client.urllib.request.urlopen')
+ @patch("pydruid.client.urllib.request.urlopen")
def test_druid_returns_error(self, mock_urlopen):
# given
mock_urlopen.side_effect = _http_error(500, "Druid error")
@@ -42,20 +51,22 @@ def test_druid_returns_error(self, mock_urlopen):
# when / then
with pytest.raises(IOError):
client.topn(
- datasource="testdatasource",
- granularity="all",
- intervals="2015-12-29/pt1h",
- aggregations={"count": doublesum("count")},
- dimension="user_name",
- metric="count",
- filter=Dimension("user_lang") == "en",
- threshold=1,
- context={"timeout": 1000})
-
- @patch('pydruid.client.urllib.request.urlopen')
+ datasource="testdatasource",
+ granularity="all",
+ intervals="2015-12-29/pt1h",
+ aggregations={"count": doublesum("count")},
+ dimension="user_name",
+ metric="count",
+ filter=Dimension("user_lang") == "en",
+ threshold=1,
+ context={"timeout": 1000},
+ )
+
+ @patch("pydruid.client.urllib.request.urlopen")
def test_druid_returns_html_error(self, mock_urlopen):
# given
- message = textwrap.dedent("""
+ message = textwrap.dedent(
+ """
@@ -68,24 +79,29 @@ def test_druid_returns_html_error(self, mock_urlopen):
Powered by Jetty:// 9.3.19.v20170502
- """).strip()
- mock_urlopen.side_effect = _http_error(500, 'Internal Server Error', message)
+ """
+ ).strip()
+ mock_urlopen.side_effect = _http_error(500, "Internal Server Error", message)
client = create_client()
# when / then
with pytest.raises(IOError) as e:
client.topn(
- datasource="testdatasource",
- granularity="all",
- intervals="2015-12-29/pt1h",
- aggregations={"count": doublesum("count")},
- dimension="user_name",
- metric="count",
- filter=Dimension("user_lang") == "en",
- threshold=1,
- context={"timeout": 1000})
-
- assert str(e.value) == textwrap.dedent("""
+ datasource="testdatasource",
+ granularity="all",
+ intervals="2015-12-29/pt1h",
+ aggregations={"count": doublesum("count")},
+ dimension="user_name",
+ metric="count",
+ filter=Dimension("user_lang") == "en",
+ threshold=1,
+ context={"timeout": 1000},
+ )
+
+ assert (
+ str(e.value)
+ == textwrap.dedent(
+ """
HTTP Error 500: Internal Server Error
Druid Error: javax.servlet.ServletException: java.lang.OutOfMemoryError: GC overhead limit exceeded
Query is: {
@@ -112,9 +128,11 @@ def test_druid_returns_html_error(self, mock_urlopen):
"queryType": "topN",
"threshold": 1
}
- """).strip()
+ """
+ ).strip()
+ )
- @patch('pydruid.client.urllib.request.urlopen')
+ @patch("pydruid.client.urllib.request.urlopen")
def test_druid_returns_results(self, mock_urlopen):
# given
response = Mock()
@@ -126,28 +144,32 @@ def test_druid_returns_results(self, mock_urlopen):
"metric" : 100
} ]
} ]
- """.encode("utf-8")
+ """.encode(
+ "utf-8"
+ )
+ response.info.return_value = {}
mock_urlopen.return_value = response
client = create_client()
# when
top = client.topn(
- datasource="testdatasource",
- granularity="all",
- intervals="2015-12-29/pt1h",
- aggregations={"count": doublesum("count")},
- dimension="user_name",
- metric="count",
- filter=Dimension("user_lang") == "en",
- threshold=1,
- context={"timeout": 1000})
+ datasource="testdatasource",
+ granularity="all",
+ intervals="2015-12-29/pt1h",
+ aggregations={"count": doublesum("count")},
+ dimension="user_name",
+ metric="count",
+ filter=Dimension("user_lang") == "en",
+ threshold=1,
+ context={"timeout": 1000},
+ )
# then
assert top is not None
assert len(top.result) == 1
- assert len(top.result[0]['result']) == 1
+ assert len(top.result[0]["result"]) == 1
- @patch('pydruid.client.urllib.request.urlopen')
+ @patch("pydruid.client.urllib.request.urlopen")
def test_client_allows_to_export_last_query(self, mock_urlopen):
# given
response = Mock()
@@ -159,29 +181,79 @@ def test_client_allows_to_export_last_query(self, mock_urlopen):
"metric" : 100
} ]
} ]
- """.encode("utf-8")
+ """.encode(
+ "utf-8"
+ )
+ response.info.return_value = {}
mock_urlopen.return_value = response
client = create_client()
client.topn(
- datasource="testdatasource",
- granularity="all",
- intervals="2015-12-29/pt1h",
- aggregations={"count": doublesum("count")},
- dimension="user_name",
- metric="count",
- filter=Dimension("user_lang") == "en",
- threshold=1,
- context={"timeout": 1000})
+ datasource="testdatasource",
+ granularity="all",
+ intervals="2015-12-29/pt1h",
+ aggregations={"count": doublesum("count")},
+ dimension="user_name",
+ metric="count",
+ filter=Dimension("user_lang") == "en",
+ threshold=1,
+ context={"timeout": 1000},
+ )
# when / then
- # assert that last_query.export_tsv method was called (it should throw an exception, given empty path)
+ # assert that last_query.export_tsv method was called (it should throw an
+ # exception, given empty path)
with pytest.raises(TypeError):
client.export_tsv(None)
- @patch('pydruid.client.urllib.request.urlopen')
+ @patch("pydruid.client.urllib.request.urlopen")
def test_client_auth_creds(self, mock_urlopen):
client = create_client()
query = create_blank_query()
- client.set_basic_auth_credentials('myUsername', 'myPassword')
+ client.set_basic_auth_credentials("myUsername", "myPassword")
+ headers, _, _ = client._prepare_url_headers_and_body(query)
+ assert headers["Authorization"] == "Basic bXlVc2VybmFtZTpteVBhc3N3b3Jk"
+
+ def test_client_allows_extra_headers(self):
+ client = create_client(headers={"Accept-Encoding": "gzip"})
+ query = create_blank_query()
headers, _, _ = client._prepare_url_headers_and_body(query)
- assert headers['Authorization'] == "Basic bXlVc2VybmFtZTpteVBhc3N3b3Jk"
+ assert headers["Accept-Encoding"] == "gzip"
+
+ @patch("pydruid.client.urllib.request.urlopen")
+ def test_return_compressed_data(self, mock_urlopen):
+ # given
+ response = Mock()
+ response.read.return_value = compress(
+ """
+ [ {
+ "timestamp" : "2015-12-30T14:14:49.000Z",
+ "result" : [ {
+ "dimension" : "aaaa",
+ "metric" : 100
+ } ]
+ } ]
+ """.encode(
+ "utf-8"
+ )
+ )
+ response.info.return_value = {"Content-Encoding": "gzip"}
+ mock_urlopen.return_value = response
+ client = create_client(headers={"Accept-Encoding": "gzip"})
+
+ # when
+ top = client.topn(
+ datasource="testdatasource",
+ granularity="all",
+ intervals="2015-12-29/pt1h",
+ aggregations={"count": doublesum("count")},
+ dimension="user_name",
+ metric="count",
+ filter=Dimension("user_lang") == "en",
+ threshold=1,
+ context={"timeout": 1000},
+ )
+
+ # then
+ assert top is not None
+ assert len(top.result) == 1
+ assert len(top.result[0]["result"]) == 1