Hive’s OLAP functionality (OVER and RANK) to achieve performance, but without a Join.
CREATE TABLE clicks (
timestamp date, sessionID string, url string, source_ip string
) STORED as ORC tblproperties (“orc.compress” = “SNAPPY”);
OLD--
SELECT clicks.* FROM clicks inner join
(select sessionID, max(timestamp) as max_ts from clicks
group by sessionID) latest
ON clicks.sessionID = latest.sessionID and
clicks.timestamp = latest.max_ts;
NEW--
SELECT * FROM (SELECT *, RANK() over (partition by sessionID
order by timestamp desc) as rank FROM clicks) ranked_clicks
WHERE ranked_clicks.rank=1;
Copyright ©2022 coderraj.com. All Rights Reserved.