digraph G {
0 [labelType="html" label="<br><b>CollectLimit</b><br><br>"];
subgraph cluster1 {
isCluster="true";
label="WholeStageCodegen (9)\n \nduration: total (min, med, max (stageId: taskId))\n13 ms (0 ms, 3 ms, 4 ms (stage 18.0: task 673))";
2 [labelType="html" label="<b>HashAggregate</b><br><br>time in aggregation build total (min, med, max (stageId: taskId))<br>8 ms (0 ms, 2 ms, 3 ms (stage 18.0: task 673))<br>peak memory total (min, med, max (stageId: taskId))<br>257.3 MiB (256.0 KiB, 64.3 MiB, 64.3 MiB (stage 13.0: task 670))<br>number of output rows: 12<br>avg hash probe bucket list iters (min, med, max (stageId: taskId)):<br>(1, 1, 1 (stage 13.0: task 670))"];
}
3 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 8,528<br>shuffle write time total (min, med, max (stageId: taskId))<br>672 ms (0 ms, 4 ms, 8 ms (stage 12.0: task 613))<br>records read: 416<br>local bytes read total (min, med, max (stageId: taskId))<br>15.5 KiB (0.0 B, 3.7 KiB, 6.1 KiB (stage 18.0: task 671))<br>fetch wait time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 13.0: task 670))<br>remote bytes read total (min, med, max (stageId: taskId))<br>15.2 KiB (0.0 B, 3.5 KiB, 5.7 KiB (stage 18.0: task 671))<br>local blocks read: 136<br>remote blocks read: 137<br>data size total (min, med, max (stageId: taskId))<br>523.5 KiB (0.0 B, 2.3 KiB, 7.3 KiB (stage 12.0: task 494))<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>744.8 KiB (0.0 B, 3.3 KiB, 10.1 KiB (stage 12.0: task 494))"];
subgraph cluster4 {
isCluster="true";
label="WholeStageCodegen (8)\n \nduration: total (min, med, max (stageId: taskId))\n25.0 s (1 ms, 18 ms, 1.3 s (stage 12.0: task 557))";
5 [labelType="html" label="<b>HashAggregate</b><br><br>time in aggregation build total (min, med, max (stageId: taskId))<br>23.8 s (0 ms, 11 ms, 1.3 s (stage 12.0: task 557))<br>peak memory total (min, med, max (stageId: taskId))<br>50.0 MiB (256.0 KiB, 256.0 KiB, 256.0 KiB (stage 12.0: task 471))<br>number of output rows: 8,528"];
6 [labelType="html" label="<br><b>Project</b><br><br>"];
}
7 [labelType="html" label="<b>SortMergeJoin</b><br><br>number of output rows: 22,400,728"];
subgraph cluster8 {
isCluster="true";
label="WholeStageCodegen (5)\n \nduration: total (min, med, max (stageId: taskId))\n25.6 s (3 ms, 20 ms, 1.3 s (stage 12.0: task 557))";
9 [labelType="html" label="<b>Sort</b><br><br>sort time total (min, med, max (stageId: taskId))<br>119 ms (0 ms, 0 ms, 20 ms (stage 12.0: task 557))<br>peak memory total (min, med, max (stageId: taskId))<br>10.8 GiB (64.0 KiB, 64.3 MiB, 256.0 MiB (stage 12.0: task 557))<br>spill size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 12.0: task 471))"];
}
10 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 22,400,728<br>shuffle write time total (min, med, max (stageId: taskId))<br>1.5 s (0 ms, 6 ms, 41 ms (stage 11.0: task 357))<br>records read: 22,400,728<br>local bytes read total (min, med, max (stageId: taskId))<br>191.4 MiB (0.0 B, 72.0 KiB, 11.7 MiB (stage 12.0: task 557))<br>fetch wait time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 12.0: task 471))<br>remote bytes read total (min, med, max (stageId: taskId))<br>192.2 MiB (0.0 B, 77.1 KiB, 11.6 MiB (stage 12.0: task 557))<br>local blocks read: 8,577<br>remote blocks read: 8,717<br>data size total (min, med, max (stageId: taskId))<br>2.3 GiB (0.0 B, 152.6 KiB, 154.1 MiB (stage 11.0: task 357))<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>383.6 MiB (0.0 B, 41.5 KiB, 24.5 MiB (stage 11.0: task 357))"];
subgraph cluster11 {
isCluster="true";
label="WholeStageCodegen (4)\n \nduration: total (min, med, max (stageId: taskId))\n24.2 s (0 ms, 10 ms, 1.8 s (stage 11.0: task 357))";
12 [labelType="html" label="<br><b>Project</b><br><br>"];
}
13 [labelType="html" label="<b>SortMergeJoin</b><br><br>number of output rows: 22,400,728"];
subgraph cluster14 {
isCluster="true";
label="WholeStageCodegen (1)\n \nduration: total (min, med, max (stageId: taskId))\n25.4 s (3 ms, 14 ms, 1.8 s (stage 11.0: task 357))";
15 [labelType="html" label="<b>Sort</b><br><br>sort time total (min, med, max (stageId: taskId))<br>151 ms (0 ms, 0 ms, 25 ms (stage 11.0: task 357))<br>peak memory total (min, med, max (stageId: taskId))<br>11.0 GiB (64.0 KiB, 64.1 MiB, 256.0 MiB (stage 11.0: task 357))<br>spill size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 11.0: task 271))"];
}
16 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 22,400,728<br>shuffle write time total (min, med, max (stageId: taskId))<br>841 ms (24 ms, 42 ms, 54 ms (stage 9.0: task 255))<br>records read: 22,400,728<br>local bytes read total (min, med, max (stageId: taskId))<br>227.1 MiB (0.0 B, 18.3 KiB, 15.2 MiB (stage 11.0: task 368))<br>fetch wait time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 11.0: task 271))<br>remote bytes read total (min, med, max (stageId: taskId))<br>223.0 MiB (0.0 B, 14.7 KiB, 17.1 MiB (stage 11.0: task 357))<br>local blocks read: 1,468<br>remote blocks read: 1,381<br>data size total (min, med, max (stageId: taskId))<br>2.2 GiB (34.9 MiB, 126.4 MiB, 126.5 MiB (stage 9.0: task 252))<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>450.1 MiB (7.3 MiB, 25.6 MiB, 25.6 MiB (stage 9.0: task 257))"];
17 [labelType="html" label="<b>Scan csv </b><br><br>number of files read: 7<br>metadata time: 0 ms<br>size of files read: 2.2 GiB<br>number of output rows: 22,400,728"];
subgraph cluster18 {
isCluster="true";
label="WholeStageCodegen (3)\n \nduration: total (min, med, max (stageId: taskId))\n14.7 s (1 ms, 6 ms, 1.6 s (stage 11.0: task 357))";
19 [labelType="html" label="<b>Sort</b><br><br>sort time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 11.0: task 271))<br>peak memory total (min, med, max (stageId: taskId))<br>9.1 GiB (64.0 KiB, 64.1 MiB, 64.1 MiB (stage 11.0: task 271))<br>spill size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 11.0: task 271))"];
}
20 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 265<br>shuffle write time total (min, med, max (stageId: taskId))<br>16 ms (7 ms, 8 ms, 8 ms (stage 10.0: task 249))<br>records read: 530<br>local bytes read total (min, med, max (stageId: taskId))<br>15.8 KiB (0.0 B, 0.0 B, 127.0 B (stage 11.0: task 380))<br>fetch wait time total (min, med, max (stageId: taskId))<br>877 ms (0 ms, 0 ms, 29 ms (stage 11.0: task 352))<br>remote bytes read total (min, med, max (stageId: taskId))<br>16.1 KiB (0.0 B, 72.0 B, 151.0 B (stage 12.0: task 494))<br>local blocks read: 197<br>remote blocks read: 201<br>data size total (min, med, max (stageId: taskId))<br>9.0 KiB (4.4 KiB, 4.6 KiB, 4.6 KiB (stage 10.0: task 248))<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>16.0 KiB (7.8 KiB, 8.2 KiB, 8.2 KiB (stage 10.0: task 248))"];
subgraph cluster21 {
isCluster="true";
label="WholeStageCodegen (2)\n \nduration: total (min, med, max (stageId: taskId))\n70 ms (33 ms, 37 ms, 37 ms (stage 10.0: task 249))";
22 [labelType="html" label="<br><b>Project</b><br><br>"];
23 [labelType="html" label="<b>Filter</b><br><br>number of output rows: 265"];
24 [labelType="html" label="<b>Scan ExistingRDD</b><br><br>number of output rows: 265"];
}
subgraph cluster25 {
isCluster="true";
label="WholeStageCodegen (7)\n \nduration: total (min, med, max (stageId: taskId))\n14.5 s (1 ms, 12 ms, 1.1 s (stage 12.0: task 557))";
26 [labelType="html" label="<b>Sort</b><br><br>sort time total (min, med, max (stageId: taskId))<br>0 ms (0 ms, 0 ms, 0 ms (stage 12.0: task 471))<br>peak memory total (min, med, max (stageId: taskId))<br>9.1 GiB (64.0 KiB, 64.1 MiB, 64.1 MiB (stage 12.0: task 471))<br>spill size total (min, med, max (stageId: taskId))<br>0.0 B (0.0 B, 0.0 B, 0.0 B (stage 12.0: task 471))"];
}
2->0;
3->2;
5->3;
6->5;
7->6;
9->7;
10->9;
12->10;
13->12;
15->13;
16->15;
17->16;
19->13;
20->19;
22->20;
23->22;
24->23;
26->7;
20->26;
}
27
CollectLimit 11
HashAggregate(keys=[Month#405, route#383], functions=[sum(cast(tip_amount#65 as double)), sum(cast(passenger_count#57 as double))])
WholeStageCodegen (9)
Exchange hashpartitioning(Month#405, route#383, 200), true, [id=#303]
HashAggregate(keys=[Month#405, route#383], functions=[partial_sum(cast(tip_amount#65 as double)), partial_sum(cast(passenger_count#57 as double))])
Project [passenger_count#57, tip_amount#65, concat_ws( to , Pickup_Borough#147, Borough#15) AS route#383, month(cast(tpep_pickup_datetime#55 as date)) AS Month#405]
WholeStageCodegen (8)
SortMergeJoin [cast(DOLocationID#60 as int)], [LocationID#14], LeftOuter
Sort [cast(DOLocationID#60 as int) ASC NULLS FIRST], false, 0
WholeStageCodegen (5)
Exchange hashpartitioning(cast(DOLocationID#60 as int), 200), true, [id=#259]
Project [tpep_pickup_datetime#55, passenger_count#57, DOLocationID#60, tip_amount#65, Borough#15 AS Pickup_Borough#147]
WholeStageCodegen (4)
SortMergeJoin [cast(PULocationID#59 as int)], [LocationID#14], LeftOuter
Sort [cast(PULocationID#59 as int) ASC NULLS FIRST], false, 0
WholeStageCodegen (1)
Exchange hashpartitioning(cast(PULocationID#59 as int), 200), true, [id=#228]
FileScan csv [tpep_pickup_datetime#55,passenger_count#57,PULocationID#59,DOLocationID#60,tip_amount#65] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/yellow_tripdata/2023/yellow_tripdata_..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<tpep_pickup_datetime:string,passenger_count:string,PULocationID:string,DOLocationID:string...
Sort [LocationID#14 ASC NULLS FIRST], false, 0
WholeStageCodegen (3)
Exchange hashpartitioning(LocationID#14, 200), true, [id=#251]
Project [LocationID#14, Borough#15]
Filter isnotnull(LocationID#14)
Scan ExistingRDD[LocationID#14,Borough#15,Zone#16,service_zone#17]
WholeStageCodegen (2)
Sort [LocationID#14 ASC NULLS FIRST], false, 0
WholeStageCodegen (7)
== Parsed Logical Plan ==
GlobalLimit 11
+- LocalLimit 11
+- Project [cast(Month#405 as string) AS Month#574, cast(route#383 as string) AS route#575, cast(total_tip_amount#556 as string) AS total_tip_amount#576, cast(total_passenger_count#558 as string) AS total_passenger_count#577, cast(average_tip_per_passenger#563 as string) AS average_tip_per_passenger#578]
+- Project [Month#405, route#383, total_tip_amount#556, total_passenger_count#558, round((total_tip_amount#556 / total_passenger_count#558), 2) AS average_tip_per_passenger#563]
+- Aggregate [Month#405, route#383], [Month#405, route#383, round(sum(cast(tip_amount#65 as double)), 2) AS total_tip_amount#556, sum(cast(passenger_count#57 as double)) AS total_passenger_count#558]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, Dropoff_Borough#294, Dropoff_Zone#317, Dropoff_service_zone#340, route#383, month(cast(tpep_pickup_datetime#55 as date)) AS Month#405]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, Dropoff_Borough#294, Dropoff_Zone#317, Dropoff_service_zone#340, concat_ws( to , Pickup_Borough#147, Dropoff_Borough#294) AS route#383]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, Dropoff_Borough#294, Dropoff_Zone#317, Dropoff_service_zone#340]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, LocationID#14, Dropoff_Borough#294, Dropoff_Zone#317, service_zone#17 AS Dropoff_service_zone#340]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, LocationID#14, Dropoff_Borough#294, Zone#16 AS Dropoff_Zone#317, service_zone#17]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, LocationID#14, Borough#15 AS Dropoff_Borough#294, Zone#16, service_zone#17]
+- Join LeftOuter, (cast(DOLocationID#60 as int) = LocationID#14)
:- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189]
: +- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, PULocationID#59, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, LocationID#14, Pickup_Borough#147, Pickup_Zone#168, service_zone#17 AS Pickup_service_zone#189]
: +- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, PULocationID#59, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, LocationID#14, Pickup_Borough#147, Zone#16 AS Pickup_Zone#168, service_zone#17]
: +- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, PULocationID#59, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, LocationID#14, Borough#15 AS Pickup_Borough#147, Zone#16, service_zone#17]
: +- Join LeftOuter, (cast(PULocationID#59 as int) = LocationID#14)
: :- Relation[tpep_pickup_datetime#55,tpep_dropoff_datetime#56,passenger_count#57,trip_distance#58,PULocationID#59,DOLocationID#60,payment_type#61,fare_amount#62,extra#63,mta_tax#64,tip_amount#65,tolls_amount#66,total_amount#67,congestion_surcharge#68,airport_fee#69,taxi_type#70] csv
: +- LogicalRDD [LocationID#14, Borough#15, Zone#16, service_zone#17], false
+- LogicalRDD [LocationID#14, Borough#15, Zone#16, service_zone#17], false
== Analyzed Logical Plan ==
Month: string, route: string, total_tip_amount: string, total_passenger_count: string, average_tip_per_passenger: string
GlobalLimit 11
+- LocalLimit 11
+- Project [cast(Month#405 as string) AS Month#574, cast(route#383 as string) AS route#575, cast(total_tip_amount#556 as string) AS total_tip_amount#576, cast(total_passenger_count#558 as string) AS total_passenger_count#577, cast(average_tip_per_passenger#563 as string) AS average_tip_per_passenger#578]
+- Project [Month#405, route#383, total_tip_amount#556, total_passenger_count#558, round((total_tip_amount#556 / total_passenger_count#558), 2) AS average_tip_per_passenger#563]
+- Aggregate [Month#405, route#383], [Month#405, route#383, round(sum(cast(tip_amount#65 as double)), 2) AS total_tip_amount#556, sum(cast(passenger_count#57 as double)) AS total_passenger_count#558]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, Dropoff_Borough#294, Dropoff_Zone#317, Dropoff_service_zone#340, route#383, month(cast(tpep_pickup_datetime#55 as date)) AS Month#405]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, Dropoff_Borough#294, Dropoff_Zone#317, Dropoff_service_zone#340, concat_ws( to , Pickup_Borough#147, Dropoff_Borough#294) AS route#383]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, Dropoff_Borough#294, Dropoff_Zone#317, Dropoff_service_zone#340]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, LocationID#14, Dropoff_Borough#294, Dropoff_Zone#317, service_zone#17 AS Dropoff_service_zone#340]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, LocationID#14, Dropoff_Borough#294, Zone#16 AS Dropoff_Zone#317, service_zone#17]
+- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189, LocationID#14, Borough#15 AS Dropoff_Borough#294, Zone#16, service_zone#17]
+- Join LeftOuter, (cast(DOLocationID#60 as int) = LocationID#14)
:- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, Pickup_Borough#147, Pickup_Zone#168, Pickup_service_zone#189]
: +- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, PULocationID#59, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, LocationID#14, Pickup_Borough#147, Pickup_Zone#168, service_zone#17 AS Pickup_service_zone#189]
: +- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, PULocationID#59, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, LocationID#14, Pickup_Borough#147, Zone#16 AS Pickup_Zone#168, service_zone#17]
: +- Project [tpep_pickup_datetime#55, tpep_dropoff_datetime#56, passenger_count#57, trip_distance#58, PULocationID#59, DOLocationID#60, payment_type#61, fare_amount#62, extra#63, mta_tax#64, tip_amount#65, tolls_amount#66, total_amount#67, congestion_surcharge#68, airport_fee#69, taxi_type#70, LocationID#14, Borough#15 AS Pickup_Borough#147, Zone#16, service_zone#17]
: +- Join LeftOuter, (cast(PULocationID#59 as int) = LocationID#14)
: :- Relation[tpep_pickup_datetime#55,tpep_dropoff_datetime#56,passenger_count#57,trip_distance#58,PULocationID#59,DOLocationID#60,payment_type#61,fare_amount#62,extra#63,mta_tax#64,tip_amount#65,tolls_amount#66,total_amount#67,congestion_surcharge#68,airport_fee#69,taxi_type#70] csv
: +- LogicalRDD [LocationID#14, Borough#15, Zone#16, service_zone#17], false
+- LogicalRDD [LocationID#14, Borough#15, Zone#16, service_zone#17], false
== Optimized Logical Plan ==
GlobalLimit 11
+- LocalLimit 11
+- Aggregate [Month#405, route#383], [cast(Month#405 as string) AS Month#574, route#383, cast(round(sum(cast(tip_amount#65 as double)), 2) as string) AS total_tip_amount#576, cast(sum(cast(passenger_count#57 as double)) as string) AS total_passenger_count#577, cast(round((round(sum(cast(tip_amount#65 as double)), 2) / sum(cast(passenger_count#57 as double))), 2) as string) AS average_tip_per_passenger#578]
+- Project [passenger_count#57, tip_amount#65, concat_ws( to , Pickup_Borough#147, Borough#15) AS route#383, month(cast(tpep_pickup_datetime#55 as date)) AS Month#405]
+- Join LeftOuter, (cast(DOLocationID#60 as int) = LocationID#14)
:- Project [tpep_pickup_datetime#55, passenger_count#57, DOLocationID#60, tip_amount#65, Borough#15 AS Pickup_Borough#147]
: +- Join LeftOuter, (cast(PULocationID#59 as int) = LocationID#14)
: :- Project [tpep_pickup_datetime#55, passenger_count#57, PULocationID#59, DOLocationID#60, tip_amount#65]
: : +- Relation[tpep_pickup_datetime#55,tpep_dropoff_datetime#56,passenger_count#57,trip_distance#58,PULocationID#59,DOLocationID#60,payment_type#61,fare_amount#62,extra#63,mta_tax#64,tip_amount#65,tolls_amount#66,total_amount#67,congestion_surcharge#68,airport_fee#69,taxi_type#70] csv
: +- Project [LocationID#14, Borough#15]
: +- Filter isnotnull(LocationID#14)
: +- LogicalRDD [LocationID#14, Borough#15, Zone#16, service_zone#17], false
+- Project [LocationID#14, Borough#15]
+- Filter isnotnull(LocationID#14)
+- LogicalRDD [LocationID#14, Borough#15, Zone#16, service_zone#17], false
== Physical Plan ==
CollectLimit 11
+- *(9) HashAggregate(keys=[Month#405, route#383], functions=[sum(cast(tip_amount#65 as double)), sum(cast(passenger_count#57 as double))], output=[Month#574, route#383, total_tip_amount#576, total_passenger_count#577, average_tip_per_passenger#578])
+- Exchange hashpartitioning(Month#405, route#383, 200), true, [id=#303]
+- *(8) HashAggregate(keys=[Month#405, route#383], functions=[partial_sum(cast(tip_amount#65 as double)), partial_sum(cast(passenger_count#57 as double))], output=[Month#405, route#383, sum#586, sum#587])
+- *(8) Project [passenger_count#57, tip_amount#65, concat_ws( to , Pickup_Borough#147, Borough#15) AS route#383, month(cast(tpep_pickup_datetime#55 as date)) AS Month#405]
+- SortMergeJoin [cast(DOLocationID#60 as int)], [LocationID#14], LeftOuter
:- *(5) Sort [cast(DOLocationID#60 as int) ASC NULLS FIRST], false, 0
: +- Exchange hashpartitioning(cast(DOLocationID#60 as int), 200), true, [id=#259]
: +- *(4) Project [tpep_pickup_datetime#55, passenger_count#57, DOLocationID#60, tip_amount#65, Borough#15 AS Pickup_Borough#147]
: +- SortMergeJoin [cast(PULocationID#59 as int)], [LocationID#14], LeftOuter
: :- *(1) Sort [cast(PULocationID#59 as int) ASC NULLS FIRST], false, 0
: : +- Exchange hashpartitioning(cast(PULocationID#59 as int), 200), true, [id=#228]
: : +- FileScan csv [tpep_pickup_datetime#55,passenger_count#57,PULocationID#59,DOLocationID#60,tip_amount#65] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/yellow_tripdata/2023/yellow_tripdata_..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<tpep_pickup_datetime:string,passenger_count:string,PULocationID:string,DOLocationID:string...
: +- *(3) Sort [LocationID#14 ASC NULLS FIRST], false, 0
: +- Exchange hashpartitioning(LocationID#14, 200), true, [id=#251]
: +- *(2) Project [LocationID#14, Borough#15]
: +- *(2) Filter isnotnull(LocationID#14)
: +- *(2) Scan ExistingRDD[LocationID#14,Borough#15,Zone#16,service_zone#17]
+- *(7) Sort [LocationID#14 ASC NULLS FIRST], false, 0
+- ReusedExchange [LocationID#14, Borough#15], Exchange hashpartitioning(LocationID#14, 200), true, [id=#251]