digraph G {
0 [labelType="html" label="<br><b>CollectLimit</b><br><br>"];
subgraph cluster1 {
isCluster="true";
label="WholeStageCodegen (4)\n \nduration: total (min, med, max (stageId: taskId))\n32 ms (2 ms, 5 ms, 13 ms (stage 12.0: task 267))";
2 [labelType="html" label="<b>HashAggregate</b><br><br>time in aggregation build total (min, med, max (stageId: taskId))<br>20 ms (0 ms, 2 ms, 11 ms (stage 12.0: task 267))<br>peak memory total (min, med, max (stageId: taskId))<br>257.3 MiB (256.0 KiB, 64.3 MiB, 64.3 MiB (stage 10.0: task 266))<br>number of output rows: 12<br>avg hash probe bucket list iters (min, med, max (stageId: taskId)):<br>(1, 1, 1 (stage 10.0: task 266))"];
}
3 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 1,048<br>shuffle write time total (min, med, max (stageId: taskId))<br>189 ms (6 ms, 8 ms, 15 ms (stage 9.0: task 249))<br>records read: 42<br>local bytes read total (min, med, max (stageId: taskId))<br>1416.0 B (0.0 B, 365.0 B, 537.0 B (stage 10.0: task 266))<br>fetch wait time total (min, med, max (stageId: taskId))<br>3 ms (0 ms, 0 ms, 3 ms (stage 12.0: task 269))<br>remote bytes read total (min, med, max (stageId: taskId))<br>2016.0 B (0.0 B, 481.0 B, 691.0 B (stage 12.0: task 269))<br>local blocks read: 13<br>remote blocks read: 18<br>data size total (min, med, max (stageId: taskId))<br>63.9 KiB (2.7 KiB, 3.1 KiB, 5.6 KiB (stage 9.0: task 264))<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>94.8 KiB (3.9 KiB, 4.6 KiB, 8.1 KiB (stage 9.0: task 264))"];
subgraph cluster4 {
isCluster="true";
label="WholeStageCodegen (3)\n \nduration: total (min, med, max (stageId: taskId))\n1.5 m (1.5 s, 5.1 s, 5.9 s (stage 9.0: task 246))";
5 [labelType="html" label="<b>HashAggregate</b><br><br>time in aggregation build total (min, med, max (stageId: taskId))<br>1.5 m (1.5 s, 5.0 s, 5.8 s (stage 9.0: task 246))<br>peak memory total (min, med, max (stageId: taskId))<br>5.0 MiB (256.0 KiB, 256.0 KiB, 256.0 KiB (stage 9.0: task 247))<br>number of output rows: 1,048"];
6 [labelType="html" label="<br><b>Project</b><br><br>"];
7 [labelType="html" label="<b>BroadcastHashJoin</b><br><br>number of output rows: 22,400,728"];
8 [labelType="html" label="<br><b>Project</b><br><br>"];
9 [labelType="html" label="<b>BroadcastHashJoin</b><br><br>number of output rows: 22,400,728"];
10 [labelType="html" label="<br><b>Project</b><br><br>"];
11 [labelType="html" label="<b>Filter</b><br><br>number of output rows: 22,400,728"];
}
12 [labelType="html" label="<b>Scan csv </b><br><br>number of files read: 7<br>metadata time: 0 ms<br>size of files read: 2.2 GiB<br>number of output rows: 22,400,728"];
13 [labelType="html" label="<b>BroadcastExchange</b><br><br>data size: 1040.0 KiB<br>time to collect: 214 ms<br>time to build: 10 ms<br>time to broadcast: 4 ms"];
subgraph cluster14 {
isCluster="true";
label="WholeStageCodegen (1)\n \nduration: 103 ms";
15 [labelType="html" label="<br><b>Project</b><br><br>"];
16 [labelType="html" label="<b>Filter</b><br><br>number of output rows: 265"];
}
17 [labelType="html" label="<b>Scan csv </b><br><br>number of files read: 1<br>metadata time: 0 ms<br>size of files read: 10.5 KiB<br>number of output rows: 265"];
2->0;
3->2;
5->3;
6->5;
7->6;
8->7;
9->8;
10->9;
11->10;
12->11;
13->9;
15->13;
16->15;
17->16;
13->7;
}
18
CollectLimit 11
HashAggregate(keys=[Month#378, route#356], functions=[sum(cast(tip_amount#50 as double)), sum(cast(passenger_count#42 as double))])
WholeStageCodegen (4)
Exchange hashpartitioning(Month#378, route#356, 200), true, [id=#289]
HashAggregate(keys=[Month#378, route#356], functions=[partial_sum(cast(tip_amount#50 as double)), partial_sum(cast(passenger_count#42 as double))])
Project [passenger_count#42, tip_amount#50, concat_ws( to , Pickup_Borough#206, Borough#17) AS route#356, month(cast(tpep_pickup_datetime#40 as date)) AS Month#378]
BroadcastHashJoin [DOLocationID#45], [LocationID#16], Inner, BuildRight
Project [tpep_pickup_datetime#40, passenger_count#42, DOLocationID#45, tip_amount#50, Borough#17 AS Pickup_Borough#206]
BroadcastHashJoin [PULocationID#44], [LocationID#16], Inner, BuildRight
Project [tpep_pickup_datetime#40, passenger_count#42, PULocationID#44, DOLocationID#45, tip_amount#50]
Filter (isnotnull(PULocationID#44) AND isnotnull(DOLocationID#45))
WholeStageCodegen (3)
FileScan csv [tpep_pickup_datetime#40,passenger_count#42,PULocationID#44,DOLocationID#45,tip_amount#50] Batched: false, DataFilters: [isnotnull(PULocationID#44), isnotnull(DOLocationID#45)], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/yellow_tripdata/2023], PartitionFilters: [], PushedFilters: [IsNotNull(PULocationID), IsNotNull(DOLocationID)], ReadSchema: struct<tpep_pickup_datetime:string,passenger_count:string,PULocationID:string,DOLocationID:string...
BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true])), [id=#248]
Project [LocationID#16, Borough#17]
Filter isnotnull(LocationID#16)
WholeStageCodegen (1)
FileScan csv [LocationID#16,Borough#17] Batched: false, DataFilters: [isnotnull(LocationID#16)], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi], PartitionFilters: [], PushedFilters: [IsNotNull(LocationID)], ReadSchema: struct<LocationID:string,Borough:string>
== Parsed Logical Plan ==
GlobalLimit 11
+- LocalLimit 11
+- Project [cast(Month#378 as string) AS Month#551, cast(route#356 as string) AS route#552, cast(total_tip_amount#533 as string) AS total_tip_amount#553, cast(total_passenger_count#535 as string) AS total_passenger_count#554, cast(average_tip_per_passenger#540 as string) AS average_tip_per_passenger#555]
+- Project [Month#378, route#356, total_tip_amount#533, total_passenger_count#535, (total_tip_amount#533 / total_passenger_count#535) AS average_tip_per_passenger#540]
+- Aggregate [Month#378, route#356], [Month#378, route#356, sum(cast(tip_amount#50 as double)) AS total_tip_amount#533, sum(cast(passenger_count#42 as double)) AS total_passenger_count#535]
+- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Pickup_Borough#206, Pickup_Zone#207, Pickup_service_zone#208, Dropoff_Borough#312, Dropoff_Zone#313, Dropoff_service_zone#314, route#356, month(cast(tpep_pickup_datetime#40 as date)) AS Month#378]
+- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Pickup_Borough#206, Pickup_Zone#207, Pickup_service_zone#208, Dropoff_Borough#312, Dropoff_Zone#313, Dropoff_service_zone#314, concat_ws( to , Pickup_Borough#206, Dropoff_Borough#312) AS route#356]
+- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Pickup_Borough#206, Pickup_Zone#207, Pickup_service_zone#208, Dropoff_Borough#312, Dropoff_Zone#313, Dropoff_service_zone#314]
+- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, DOLocationID#45, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Pickup_Borough#206, Pickup_Zone#207, Pickup_service_zone#208, Borough#17 AS Dropoff_Borough#312, Zone#18 AS Dropoff_Zone#313, service_zone#19 AS Dropoff_service_zone#314]
+- Join Inner, (DOLocationID#45 = LocationID#16)
:- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, DOLocationID#45, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Pickup_Borough#206, Pickup_Zone#207, Pickup_service_zone#208]
: +- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, PULocationID#44, DOLocationID#45, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Borough#17 AS Pickup_Borough#206, Zone#18 AS Pickup_Zone#207, service_zone#19 AS Pickup_service_zone#208]
: +- Join Inner, (PULocationID#44 = LocationID#16)
: :- Relation[tpep_pickup_datetime#40,tpep_dropoff_datetime#41,passenger_count#42,trip_distance#43,PULocationID#44,DOLocationID#45,payment_type#46,fare_amount#47,extra#48,mta_tax#49,tip_amount#50,tolls_amount#51,total_amount#52,congestion_surcharge#53,airport_fee#54,taxi_type#55] csv
: +- Relation[LocationID#16,Borough#17,Zone#18,service_zone#19] csv
+- Relation[LocationID#16,Borough#17,Zone#18,service_zone#19] csv
== Analyzed Logical Plan ==
Month: string, route: string, total_tip_amount: string, total_passenger_count: string, average_tip_per_passenger: string
GlobalLimit 11
+- LocalLimit 11
+- Project [cast(Month#378 as string) AS Month#551, cast(route#356 as string) AS route#552, cast(total_tip_amount#533 as string) AS total_tip_amount#553, cast(total_passenger_count#535 as string) AS total_passenger_count#554, cast(average_tip_per_passenger#540 as string) AS average_tip_per_passenger#555]
+- Project [Month#378, route#356, total_tip_amount#533, total_passenger_count#535, (total_tip_amount#533 / total_passenger_count#535) AS average_tip_per_passenger#540]
+- Aggregate [Month#378, route#356], [Month#378, route#356, sum(cast(tip_amount#50 as double)) AS total_tip_amount#533, sum(cast(passenger_count#42 as double)) AS total_passenger_count#535]
+- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Pickup_Borough#206, Pickup_Zone#207, Pickup_service_zone#208, Dropoff_Borough#312, Dropoff_Zone#313, Dropoff_service_zone#314, route#356, month(cast(tpep_pickup_datetime#40 as date)) AS Month#378]
+- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Pickup_Borough#206, Pickup_Zone#207, Pickup_service_zone#208, Dropoff_Borough#312, Dropoff_Zone#313, Dropoff_service_zone#314, concat_ws( to , Pickup_Borough#206, Dropoff_Borough#312) AS route#356]
+- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Pickup_Borough#206, Pickup_Zone#207, Pickup_service_zone#208, Dropoff_Borough#312, Dropoff_Zone#313, Dropoff_service_zone#314]
+- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, DOLocationID#45, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Pickup_Borough#206, Pickup_Zone#207, Pickup_service_zone#208, Borough#17 AS Dropoff_Borough#312, Zone#18 AS Dropoff_Zone#313, service_zone#19 AS Dropoff_service_zone#314]
+- Join Inner, (DOLocationID#45 = LocationID#16)
:- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, DOLocationID#45, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Pickup_Borough#206, Pickup_Zone#207, Pickup_service_zone#208]
: +- Project [tpep_pickup_datetime#40, tpep_dropoff_datetime#41, passenger_count#42, trip_distance#43, PULocationID#44, DOLocationID#45, payment_type#46, fare_amount#47, extra#48, mta_tax#49, tip_amount#50, tolls_amount#51, total_amount#52, congestion_surcharge#53, airport_fee#54, taxi_type#55, Borough#17 AS Pickup_Borough#206, Zone#18 AS Pickup_Zone#207, service_zone#19 AS Pickup_service_zone#208]
: +- Join Inner, (PULocationID#44 = LocationID#16)
: :- Relation[tpep_pickup_datetime#40,tpep_dropoff_datetime#41,passenger_count#42,trip_distance#43,PULocationID#44,DOLocationID#45,payment_type#46,fare_amount#47,extra#48,mta_tax#49,tip_amount#50,tolls_amount#51,total_amount#52,congestion_surcharge#53,airport_fee#54,taxi_type#55] csv
: +- Relation[LocationID#16,Borough#17,Zone#18,service_zone#19] csv
+- Relation[LocationID#16,Borough#17,Zone#18,service_zone#19] csv
== Optimized Logical Plan ==
GlobalLimit 11
+- LocalLimit 11
+- Aggregate [Month#378, route#356], [cast(Month#378 as string) AS Month#551, route#356, cast(sum(cast(tip_amount#50 as double)) as string) AS total_tip_amount#553, cast(sum(cast(passenger_count#42 as double)) as string) AS total_passenger_count#554, cast((sum(cast(tip_amount#50 as double)) / sum(cast(passenger_count#42 as double))) as string) AS average_tip_per_passenger#555]
+- Project [passenger_count#42, tip_amount#50, concat_ws( to , Pickup_Borough#206, Borough#17) AS route#356, month(cast(tpep_pickup_datetime#40 as date)) AS Month#378]
+- Join Inner, (DOLocationID#45 = LocationID#16)
:- Project [tpep_pickup_datetime#40, passenger_count#42, DOLocationID#45, tip_amount#50, Borough#17 AS Pickup_Borough#206]
: +- Join Inner, (PULocationID#44 = LocationID#16)
: :- Project [tpep_pickup_datetime#40, passenger_count#42, PULocationID#44, DOLocationID#45, tip_amount#50]
: : +- Filter (isnotnull(PULocationID#44) AND isnotnull(DOLocationID#45))
: : +- Relation[tpep_pickup_datetime#40,tpep_dropoff_datetime#41,passenger_count#42,trip_distance#43,PULocationID#44,DOLocationID#45,payment_type#46,fare_amount#47,extra#48,mta_tax#49,tip_amount#50,tolls_amount#51,total_amount#52,congestion_surcharge#53,airport_fee#54,taxi_type#55] csv
: +- Project [LocationID#16, Borough#17]
: +- Filter isnotnull(LocationID#16)
: +- Relation[LocationID#16,Borough#17,Zone#18,service_zone#19] csv
+- Project [LocationID#16, Borough#17]
+- Filter isnotnull(LocationID#16)
+- Relation[LocationID#16,Borough#17,Zone#18,service_zone#19] csv
== Physical Plan ==
CollectLimit 11
+- *(4) HashAggregate(keys=[Month#378, route#356], functions=[sum(cast(tip_amount#50 as double)), sum(cast(passenger_count#42 as double))], output=[Month#551, route#356, total_tip_amount#553, total_passenger_count#554, average_tip_per_passenger#555])
+- Exchange hashpartitioning(Month#378, route#356, 200), true, [id=#289]
+- *(3) HashAggregate(keys=[Month#378, route#356], functions=[partial_sum(cast(tip_amount#50 as double)), partial_sum(cast(passenger_count#42 as double))], output=[Month#378, route#356, sum#563, sum#564])
+- *(3) Project [passenger_count#42, tip_amount#50, concat_ws( to , Pickup_Borough#206, Borough#17) AS route#356, month(cast(tpep_pickup_datetime#40 as date)) AS Month#378]
+- *(3) BroadcastHashJoin [DOLocationID#45], [LocationID#16], Inner, BuildRight
:- *(3) Project [tpep_pickup_datetime#40, passenger_count#42, DOLocationID#45, tip_amount#50, Borough#17 AS Pickup_Borough#206]
: +- *(3) BroadcastHashJoin [PULocationID#44], [LocationID#16], Inner, BuildRight
: :- *(3) Project [tpep_pickup_datetime#40, passenger_count#42, PULocationID#44, DOLocationID#45, tip_amount#50]
: : +- *(3) Filter (isnotnull(PULocationID#44) AND isnotnull(DOLocationID#45))
: : +- FileScan csv [tpep_pickup_datetime#40,passenger_count#42,PULocationID#44,DOLocationID#45,tip_amount#50] Batched: false, DataFilters: [isnotnull(PULocationID#44), isnotnull(DOLocationID#45)], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/yellow_tripdata/2023], PartitionFilters: [], PushedFilters: [IsNotNull(PULocationID), IsNotNull(DOLocationID)], ReadSchema: struct<tpep_pickup_datetime:string,passenger_count:string,PULocationID:string,DOLocationID:string...
: +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true])), [id=#248]
: +- *(1) Project [LocationID#16, Borough#17]
: +- *(1) Filter isnotnull(LocationID#16)
: +- FileScan csv [LocationID#16,Borough#17] Batched: false, DataFilters: [isnotnull(LocationID#16)], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi], PartitionFilters: [], PushedFilters: [IsNotNull(LocationID)], ReadSchema: struct<LocationID:string,Borough:string>
+- ReusedExchange [LocationID#16, Borough#17], BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true])), [id=#248]