Skip to content

Commit 71a87e1

Browse files
authored
Merge pull request #807 from desmondcheongzx/update-daft-results
Update Daft results for v0.7.4
2 parents c33e5e8 + 8ced5cc commit 71a87e1

File tree

7 files changed: 115 additions and 278 deletions

daft-parquet-partitioned/benchmark.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ python3 -m venv myenv
77
source myenv/bin/activate
88
pip install pandas
99
pip install packaging
10-
pip install daft==0.4.13
10+
pip install daft==0.7.4
1111

1212
seq 0 99 | xargs -P100 -I{} bash -c 'wget --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'
1313

daft-parquet-partitioned/queries.sql

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
3636
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
3737
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
3838
SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
39-
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1010;
40-
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1010;
41-
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 110;
42-
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10010;
43-
SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 1010;
39+
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
40+
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
41+
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
42+
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
43+
SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;

daft-parquet-partitioned/query.py

Lines changed: 7 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55
import sys
66
import timeit
77
import traceback
8-
import pandas as pd
9-
from daft import col, DataType, TimeUnit
8+
from daft import col, DataType
109

1110
hits = None
1211
current_dir = os.path.dirname(os.path.abspath(__file__))
@@ -20,115 +19,35 @@
2019
with open("queries.sql") as f:
2120
sql_list = [q.strip() for q in f.read().split(';') if q.strip()]
2221

23-
def daft_offset(df, start ,end):
24-
pandas_df = df.to_pandas()
25-
sliced_df = pandas_df.iloc[start:end]
26-
return sliced_df
27-
28-
queries = []
29-
for idx, sql in enumerate(sql_list):
30-
query_entry = {"sql": sql}
31-
32-
# Current limitations and workarounds for Daft execution:
33-
34-
# 1. Queries q18, q35, q42 require manual API workarounds:
35-
# - q18: The function `extract(minute FROM EventTime)` causes an error:
36-
# `expected input to minute to be temporal, got UInt32`.
37-
# - q35: Error is `duplicate field name ClientIP in the schema`.
38-
# Attempts to alias the column in SQL but still failed.
39-
# - q42: The function `DATE_TRUNC('minute', EventTime)` causes an error:
40-
# `Unsupported SQL: Function date_trunc not found`.
41-
if idx in [18, 35, 42]:
42-
if idx == 18:
43-
query_entry["lambda"] = lambda: (
44-
hits.with_column("m", col("EventTime").dt.minute())
45-
.groupby("UserID", "m", "SearchPhrase")
46-
.agg(daft.sql_expr("COUNT(1)").alias("COUNT(*)"))
47-
.sort("COUNT(*)", desc=True)
48-
.limit(10)
49-
.select("UserID", "m", "SearchPhrase", "COUNT(*)")
50-
)
51-
elif idx == 35:
52-
query_entry["lambda"] = lambda: (
53-
hits.groupby(
54-
"ClientIP",
55-
daft.sql_expr("ClientIP - 1").alias("ClientIP - 1"),
56-
daft.sql_expr("ClientIP - 2").alias("ClientIP - 2"),
57-
daft.sql_expr("ClientIP - 3").alias("ClientIP - 3"))
58-
.agg(daft.sql_expr("COUNT(1)").alias("c"))
59-
.sort("c", desc=True)
60-
.limit(10)
61-
.select("ClientIP", "ClientIP - 1", "ClientIP - 2", "ClientIP - 3", "c")
62-
)
63-
elif idx == 42:
64-
query_entry["lambda"] = lambda: (
65-
hits.with_column("M", col("EventTime").dt.truncate("1 minute"))
66-
.where("CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0")
67-
.groupby("M")
68-
.agg(daft.sql_expr("COUNT(1)").alias("PageViews"))
69-
.sort("M", desc=False)
70-
.limit(1010)
71-
.select("M", "PageViews")
72-
)
73-
74-
# 2. OFFSET operator not supported in Daft:
75-
# For queries q38, q39, q40, q41, q42, after executing the query,
76-
# manually implement the `OFFSET` truncation logic via the API
77-
if 38 <= idx <= 42:
78-
if idx == 38:
79-
query_entry["extra_api"] = lambda df: daft_offset(df, 1000, 1010)
80-
elif idx == 39:
81-
query_entry["extra_api"] = lambda df: daft_offset(df, 1000, 1010)
82-
elif idx == 40:
83-
query_entry["extra_api"] = lambda df: daft_offset(df, 100, 110)
84-
elif idx == 41:
85-
query_entry["extra_api"] = lambda df: daft_offset(df, 10000, 10010)
86-
elif idx == 42:
87-
query_entry["extra_api"] = lambda df: daft_offset(df, 1000, 1010)
88-
89-
queries.append(query_entry)
90-
91-
def run_single_query(query, i):
22+
def run_single_query(sql, i):
9223
try:
9324
start = timeit.default_timer()
9425

9526
global hits
9627
if hits is None:
9728
hits = daft.read_parquet(parquet_path)
98-
hits = hits.with_column("EventTime", col("EventTime").cast(daft.DataType.timestamp("s")))
99-
hits = hits.with_column("EventDate", col("EventDate").cast(daft.DataType.date()))
29+
hits = hits.with_column("EventTime", col("EventTime").cast(DataType.timestamp("s")))
30+
hits = hits.with_column("EventDate", col("EventDate").cast(DataType.date()))
10031
hits = hits.with_column("URL", col("URL").decode("utf-8"))
10132
hits = hits.with_column("Title", col("Title").decode("utf-8"))
10233
hits = hits.with_column("Referer", col("Referer").decode("utf-8"))
10334
hits = hits.with_column("MobilePhoneModel", col("MobilePhoneModel").decode("utf-8"))
10435
hits = hits.with_column("SearchPhrase", col("SearchPhrase").decode("utf-8"))
10536

106-
result = None
107-
108-
if "lambda" in query:
109-
result = query["lambda"]()
110-
else:
111-
result = daft.sql(query["sql"])
112-
37+
result = daft.sql(sql)
11338
result.collect()
11439

115-
if "extra_api" in query:
116-
result = query["extra_api"](result)
117-
11840
run_time = round(timeit.default_timer() - start, 3)
119-
12041
return run_time
12142
except Exception as e:
12243
print(f"Error executing query {query_idx}: {str(e)[:100]}", file=sys.stderr)
12344
traceback.print_exc()
12445
return None
12546

12647
if __name__ == "__main__":
127-
query = queries[query_idx]
128-
48+
sql = sql_list[query_idx]
12949
times = []
13050
for i in range(3):
131-
elapsed = run_single_query(query, i)
51+
elapsed = run_single_query(sql, i)
13252
times.append(f"{elapsed}" if elapsed else "")
133-
13453
print(','.join(times))
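
[file path not captured in this extract — JSON results file for "Daft (Parquet, partitioned)" on c6a.4xlarge]

Lines changed: 45 additions & 45 deletions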
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"system": "Daft (Parquet, partitioned)",
3-
"date": "2025-08-31",
3+
"date": "2026-03-30",
44
"machine": "c6a.4xlarge",
55
"cluster_size": 1,
66
"proprietary": "no",
@@ -10,49 +10,49 @@
1010
"load_time": 0,
1111
"data_size": 14737666736,
1212
"result": [
13-
[3.405, 0.26, 0.266],
14-
[3.509, 0.261, 0.258],
15-
[3.91, 0.395, 0.368],
16-
[0.813, 0.149, 0.15],
17-
[9.116, 8.631, 8.409],
18-
[8.051, 6.974, 7.173],
19-
[0.435, 0.1, 0.098],
20-
[3.065, 0.263, 0.268],
21-
[3.38, 3.057, 3.004],
22-
[6.536, 2.51, 2.426],
23-
[1.131, 0.822, 0.815],
24-
[1.182, 0.845, 0.85],
25-
[5.924, 2.349, 2.357],
26-
[4.114, 2.992, 2.978],
27-
[5.986, 2.412, 2.333],
28-
[6.246, 2.762, 2.771],
29-
[9.816, 5.644, 5.712],
30-
[7.295, 3.218, 3.142],
31-
[10.923, 9.643, 9.64],
32-
[0.499, 0.141, 0.139],
33-
[13.56, 2.807, 2.863],
34-
[15.323, 3.116, 3.064],
35-
[25.755, 7.577, 7.515],
36-
[55.781, 14.548, 14.621],
37-
[5.652, 3.587, 3.442],
38-
[11.343, 11.418, 10.956],
39-
[8.59, 6.296, 6.558],
40-
[13.576, 4.546, 4.483],
41-
[55.295, 55.039, 55],
42-
[1.92, 1.82, 1.789],
43-
[6.911, 1.77, 1.739],
44-
[7.423, 2.245, 2.302],
45-
[15.001, 12.929, 12.968],
46-
[20.967, 13.349, 13.238],
47-
[20.256, 13.594, 12.858],
48-
[3.022, 2.836, 2.788],
49-
[0.776, 0.32, 0.334],
50-
[0.57, 0.166, 0.143],
51-
[0.663, 0.168, 0.147],
52-
[0.969, 0.432, 0.435],
53-
[0.583, 0.101, 0.094],
54-
[0.559, 0.098, 0.095],
55-
[0.531, 0.086, 0.082]
56-
]
13+
[1.634, 0.104, 0.101],
14+
[0.619, 0.161, 0.161],
15+
[0.621, 0.236, 0.235],
16+
[0.882, 0.139, 0.138],
17+
[1.11, 0.886, 0.894],
18+
[1.688, 1.362, 1.354],
19+
[0.399, 0.095, 0.091],
20+
[0.413, 0.167, 0.165],
21+
[2.823, 2.525, 2.648],
22+
[2.197, 1.858, 1.862],
23+
[1.016, 0.681, 0.657],
24+
[1.055, 0.72, 0.718],
25+
[1.952, 1.639, 1.65],
26+
[5.331, 4.953, 5.027],
27+
[2.081, 1.621, 1.617],
28+
[2.579, 2.287, 2.311],
29+
[5.151, 4.115, 4.164],
30+
[3.715, 2.785, 2.748],
31+
[10.092, 8.217, 8.349],
32+
[0.46, 0.167, 0.172],
33+
[9.93, 2.271, 2.168],
34+
[11.648, 2.428, 2.362],
35+
[22.37, 5.4, 5.456],
36+
[55.694, 13.2, 13.413],
37+
[2.968, 0.638, 0.619],
38+
[1.005, 0.684, 0.707],
39+
[2.982, 0.678, 0.655],
40+
[9.942, 3.066, 3.155],
41+
[15.06, 14.569, 14.573],
42+
[1.383, 1.111, 1.117],
43+
[3.304, 1.638, 1.632],
44+
[7.52, 2.35, 2.338],
45+
[16.281, 14.01, 14.168],
46+
[15.405, 9.806, 9.841],
47+
[14.696, 8.875, 8.978],
48+
[2.799, 2.516, 2.552],
49+
[0.491, 0.191, 0.169],
50+
[0.398, 0.102, 0.097],
51+
[0.422, 0.1, 0.097],
52+
[0.639, 0.259, 0.221],
53+
[0.334, 0.042, 0.04],
54+
[0.32, 0.039, 0.04],
55+
[0.317, 0.04, 0.038]
56+
]
5757
}
5858

daft-parquet/queries.sql

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
3636
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
3737
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
3838
SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
39-
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1010;
40-
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1010;
41-
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 110;
42-
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10010;
43-
SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 1010;
39+
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
40+
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
41+
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
42+
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
43+
SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;

0 commit comments

Comments
 (0)