digraph G {
subgraph cluster0 {
isCluster="true";
label="WholeStageCodegen (4)\n \nduration: 1 ms";
1 [labelType="html" label="<b>HashAggregate</b><br><br>time in aggregation build: 1 ms<br>number of output rows: 1"];
}
2 [labelType="html" label="<b>Exchange</b><br><br>shuffle records written: 3<br>shuffle write time total (min, med, max (stageId: taskId))<br>2 ms (0 ms, 0 ms, 1 ms (stage 17.0: task 21))<br>records read: 3<br>local bytes read: 118.0 B<br>fetch wait time: 0 ms<br>remote bytes read: 59.0 B<br>local blocks read: 2<br>remote blocks read: 1<br>data size total (min, med, max (stageId: taskId))<br>48.0 B (16.0 B, 16.0 B, 16.0 B (stage 17.0: task 22))<br>shuffle bytes written total (min, med, max (stageId: taskId))<br>177.0 B (59.0 B, 59.0 B, 59.0 B (stage 17.0: task 22))"];
subgraph cluster3 {
isCluster="true";
label="WholeStageCodegen (3)\n \nduration: total (min, med, max (stageId: taskId))\n2.3 s (232 ms, 889 ms, 1.2 s (stage 17.0: task 21))";
4 [labelType="html" label="<b>HashAggregate</b><br><br>time in aggregation build total (min, med, max (stageId: taskId))<br>2.1 s (231 ms, 820 ms, 1.1 s (stage 17.0: task 21))<br>number of output rows: 3"];
5 [labelType="html" label="<br><b>Project</b><br><br>"];
6 [labelType="html" label="<b>BroadcastHashJoin</b><br><br>number of output rows: 285,732"];
7 [labelType="html" label="<b>BroadcastHashJoin</b><br><br>number of output rows: 466,523"];
8 [labelType="html" label="<br><b>Project</b><br><br>"];
}
9 [labelType="html" label="<b>Scan csv </b><br><br>number of files read: 7<br>metadata time: 0 ms<br>size of files read: 47.0 MiB<br>number of output rows: 466,523"];
10 [labelType="html" label="<b>BroadcastExchange</b><br><br>data size: 1026.1 KiB<br>time to collect: 178 ms<br>time to build: 8 ms<br>time to broadcast: 3 ms"];
subgraph cluster11 {
isCluster="true";
label="WholeStageCodegen (1)\n \nduration: 76 ms";
12 [labelType="html" label="<br><b>Project</b><br><br>"];
}
13 [labelType="html" label="<b>Scan csv </b><br><br>number of files read: 1<br>metadata time: 0 ms<br>size of files read: 10.5 KiB<br>number of output rows: 265"];
14 [labelType="html" label="<b>BroadcastExchange</b><br><br>data size: 1040.0 KiB<br>time to collect: 165 ms<br>time to build: 14 ms<br>time to broadcast: 5 ms"];
subgraph cluster15 {
isCluster="true";
label="WholeStageCodegen (2)\n \nduration: 60 ms";
16 [labelType="html" label="<br><b>Project</b><br><br>"];
}
17 [labelType="html" label="<b>Scan csv </b><br><br>number of files read: 1<br>metadata time: 0 ms<br>size of files read: 10.5 KiB<br>number of output rows: 265"];
2->1;
4->2;
5->4;
6->5;
7->6;
8->7;
9->8;
10->7;
12->10;
13->12;
14->6;
16->14;
17->16;
}
18
HashAggregate(keys=[], functions=[count(1)])
WholeStageCodegen (4)
Exchange SinglePartition, true, [id=#293]
HashAggregate(keys=[], functions=[partial_count(1)])
Project
BroadcastHashJoin [src#123.Borough, src#123.service_zone, edge#121.dst], [dst#125.Borough, dst#125.service_zone, dst#125.id], Inner, BuildRight
BroadcastHashJoin [edge#121.src], [src#123.id], Inner, BuildRight
Project [struct(src, PULocationID#18, dst, DOLocationID#19) AS edge#121]
WholeStageCodegen (3)
FileScan csv [PULocationID#18,DOLocationID#19] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/green_tripdata/2023/green_tripdata_20..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<PULocationID:int,DOLocationID:int>
BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].id as bigint))), [id=#281]
Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS src#123]
WholeStageCodegen (1)
FileScan csv [LocationID#72,Borough#73,Zone#74,service_zone#75] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/taxi_zone_lookup.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<LocationID:int,Borough:string,Zone:string,service_zone:string>
BroadcastExchange HashedRelationBroadcastMode(List(input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].Borough, input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].service_zone, input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].id)), [id=#287]
Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS dst#125]
WholeStageCodegen (2)
FileScan csv [LocationID#72,Borough#73,Zone#74,service_zone#75] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/taxi_zone_lookup.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<LocationID:int,Borough:string,Zone:string,service_zone:string>
== Parsed Logical Plan ==
Aggregate [count(1) AS count#243L]
+- Project [src#123.id AS src.id#199, dst#125.id AS dst.id#200, src#123.Borough AS Borough#201, src#123.service_zone AS service_zone#202]
+- Filter ((src#123.Borough = dst#125.Borough) AND (src#123.service_zone = dst#125.service_zone))
+- Project [src#123, edge#121, dst#125]
+- Join Inner, (edge#121.dst = dst#125.id)
:- Join Inner, (edge#121.src = src#123.id)
: :- Project [struct(src, src#85, dst, dst#86) AS edge#121]
: : +- Project [PULocationID#18 AS src#85, DOLocationID#19 AS dst#86]
: : +- Relation[lpep_pickup_datetime#16,lpep_dropoff_datetime#17,PULocationID#18,DOLocationID#19,passenger_count#20,trip_distance#21,fare_amount#22,extra#23,mta_tax#24,tip_amount#25,tolls_amount#26,ehail_fee#27,total_amount#28,payment_type#29,trip_type#30,congestion_surcharge#31,taxi_type#32] csv
: +- Project [struct(id, id#80, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS src#123]
: +- Project [LocationID#72 AS id#80, Borough#73, Zone#74, service_zone#75]
: +- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv
+- Project [struct(id, id#80, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS dst#125]
+- Project [LocationID#72 AS id#80, Borough#73, Zone#74, service_zone#75]
+- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv
== Analyzed Logical Plan ==
count: bigint
Aggregate [count(1) AS count#243L]
+- Project [src#123.id AS src.id#199, dst#125.id AS dst.id#200, src#123.Borough AS Borough#201, src#123.service_zone AS service_zone#202]
+- Filter ((src#123.Borough = dst#125.Borough) AND (src#123.service_zone = dst#125.service_zone))
+- Project [src#123, edge#121, dst#125]
+- Join Inner, (edge#121.dst = dst#125.id)
:- Join Inner, (edge#121.src = src#123.id)
: :- Project [struct(src, src#85, dst, dst#86) AS edge#121]
: : +- Project [PULocationID#18 AS src#85, DOLocationID#19 AS dst#86]
: : +- Relation[lpep_pickup_datetime#16,lpep_dropoff_datetime#17,PULocationID#18,DOLocationID#19,passenger_count#20,trip_distance#21,fare_amount#22,extra#23,mta_tax#24,tip_amount#25,tolls_amount#26,ehail_fee#27,total_amount#28,payment_type#29,trip_type#30,congestion_surcharge#31,taxi_type#32] csv
: +- Project [struct(id, id#80, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS src#123]
: +- Project [LocationID#72 AS id#80, Borough#73, Zone#74, service_zone#75]
: +- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv
+- Project [struct(id, id#80, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS dst#125]
+- Project [LocationID#72 AS id#80, Borough#73, Zone#74, service_zone#75]
+- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv
== Optimized Logical Plan ==
Aggregate [count(1) AS count#243L]
+- Project
+- Join Inner, (((src#123.Borough = dst#125.Borough) AND (src#123.service_zone = dst#125.service_zone)) AND (edge#121.dst = dst#125.id))
:- Join Inner, (edge#121.src = src#123.id)
: :- Project [struct(src, PULocationID#18, dst, DOLocationID#19) AS edge#121]
: : +- Relation[lpep_pickup_datetime#16,lpep_dropoff_datetime#17,PULocationID#18,DOLocationID#19,passenger_count#20,trip_distance#21,fare_amount#22,extra#23,mta_tax#24,tip_amount#25,tolls_amount#26,ehail_fee#27,total_amount#28,payment_type#29,trip_type#30,congestion_surcharge#31,taxi_type#32] csv
: +- Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS src#123]
: +- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv
+- Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS dst#125]
+- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv
== Physical Plan ==
*(4) HashAggregate(keys=[], functions=[count(1)], output=[count#243L])
+- Exchange SinglePartition, true, [id=#293]
+- *(3) HashAggregate(keys=[], functions=[partial_count(1)], output=[count#246L])
+- *(3) Project
+- *(3) BroadcastHashJoin [src#123.Borough, src#123.service_zone, edge#121.dst], [dst#125.Borough, dst#125.service_zone, dst#125.id], Inner, BuildRight
:- *(3) BroadcastHashJoin [edge#121.src], [src#123.id], Inner, BuildRight
: :- *(3) Project [struct(src, PULocationID#18, dst, DOLocationID#19) AS edge#121]
: : +- FileScan csv [PULocationID#18,DOLocationID#19] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/green_tripdata/2023/green_tripdata_20..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<PULocationID:int,DOLocationID:int>
: +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].id as bigint))), [id=#281]
: +- *(1) Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS src#123]
: +- FileScan csv [LocationID#72,Borough#73,Zone#74,service_zone#75] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/taxi_zone_lookup.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<LocationID:int,Borough:string,Zone:string,service_zone:string>
+- BroadcastExchange HashedRelationBroadcastMode(List(input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].Borough, input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].service_zone, input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].id)), [id=#287]
+- *(2) Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS dst#125]
+- FileScan csv [LocationID#72,Borough#73,Zone#74,service_zone#75] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/taxi_zone_lookup.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<LocationID:int,Borough:string,Zone:string,service_zone:string>