This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 818e7390b6 add extend sql for issue-11212 (#15500)
818e7390b6 is described below

commit 818e7390b650b4f2ba71e09f99ef0b39406bac0a
Author: LB7666 <[email protected]>
AuthorDate: Tue Apr 1 02:32:44 2025 +0800

    add extend sql for issue-11212 (#15500)
---
 benchmarks/queries/clickbench/README.md    | 33 +++++++++++++++++++++++++++++-
 benchmarks/queries/clickbench/extended.sql |  3 ++-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/benchmarks/queries/clickbench/README.md 
b/benchmarks/queries/clickbench/README.md
index 6797797409..2032427e1e 100644
--- a/benchmarks/queries/clickbench/README.md
+++ b/benchmarks/queries/clickbench/README.md
@@ -93,12 +93,14 @@ LIMIT 10;
 
 Results look like
 
+```
 +-------------+---------------------+---+------+------+------+
 | ClientIP    | WatchID             | c | tmin | tmed | tmax |
 +-------------+---------------------+---+------+------+------+
 | 1611957945  | 6655575552203051303 | 2 | 0    | 0    | 0    |
 | -1402644643 | 8566928176839891583 | 2 | 0    | 0    | 0    |
 +-------------+---------------------+---+------+------+------+
+```
 
 
 ### Q5: Response start time distribution analysis (p95)
@@ -120,13 +122,42 @@ LIMIT 10;
 ```
 
 Results look like
-
+```
 +-------------+---------------------+---+------+------+------+
 | ClientIP    | WatchID             | c | tmin | tp95 | tmax |
 +-------------+---------------------+---+------+------+------+
 | 1611957945  | 6655575552203051303 | 2 | 0    | 0    | 0    |
 | -1402644643 | 8566928176839891583 | 2 | 0    | 0    | 0    |
 +-------------+---------------------+---+------+------+------+
+```
+
+### Q6: How many social shares meet complex multi-stage filtering criteria?
+**Question**: What is the count of sharing actions from iPhone mobile users on 
specific social networks, within common timezones, participating in seasonal 
campaigns, with high screen resolutions and closely matched UTM parameters?
+**Important Query Properties**: Simple filter with high-selectivity, Costly 
string matching, A large number of filters with high overhead are positioned 
relatively later in the process
+
+```sql
+SELECT COUNT(*) AS ShareCount
+FROM hits
+WHERE
+       -- Stage 1: High-selectivity filters (fast)
+    "IsMobile" = 1 -- Filter mobile users
+    AND "MobilePhoneModel" LIKE 'iPhone%' -- Match iPhone models
+    AND "SocialAction" = 'share' -- Identify social sharing actions
+
+       -- Stage 2: Moderate filters (cheap)
+    AND "SocialSourceNetworkID" IN (5, 12) -- Filter specific social networks
+    AND "ClientTimeZone" BETWEEN -5 AND 5 -- Restrict to common timezones
+
+       -- Stage 3: Heavy computations (expensive)
+    AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT 
NULL -- Find campaign-specific referrers
+    AND CASE 
+        WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' 
+        THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT 
+        ELSE 0 
+    END > 1920 -- Extract and validate resolution parameter
+    AND levenshtein("UTMSource", "UTMCampaign") < 3 -- Verify UTM parameter 
similarity
+```
+Result is empty,Since it has already been filtered by `"SocialAction" = 
'share'`.
 
 
 ## Data Notes
diff --git a/benchmarks/queries/clickbench/extended.sql 
b/benchmarks/queries/clickbench/extended.sql
index fbabaf2a70..ef3a409c9c 100644
--- a/benchmarks/queries/clickbench/extended.sql
+++ b/benchmarks/queries/clickbench/extended.sql
@@ -3,4 +3,5 @@ SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT 
"BrowserCountry"), COUNT(DISTI
 SELECT "BrowserCountry",  COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT 
"HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") 
FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10;
 SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), 
AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice")  FROM hits 
GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s 
DESC LIMIT 10;
 SELECT "ClientIP", "WatchID",  COUNT(*) c, MIN("ResponseStartTiming") tmin, 
MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits 
WHERE "JavaEnable" = 0  GROUP BY  "ClientIP", "WatchID" HAVING c > 1 ORDER BY 
tmed DESC LIMIT 10;
-SELECT "ClientIP", "WatchID",  COUNT(*) c, MIN("ResponseStartTiming") tmin, 
APPROX_PERCENTILE_CONT("ResponseStartTiming", 0.95) tp95, 
MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY  
"ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10;
\ No newline at end of file
+SELECT "ClientIP", "WatchID",  COUNT(*) c, MIN("ResponseStartTiming") tmin, 
APPROX_PERCENTILE_CONT("ResponseStartTiming", 0.95) tp95, 
MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY  
"ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10;
+SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND 
"MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND 
"SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND 
regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND 
CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' 
THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 
1920 AND levenshtein("UTMSource", "UTMCampaign") < 3;
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to