graphframes - Details for Query 7

Details for Query 7

Submitted Time: 2024/12/20 23:05:31
Duration: 2 s
Succeeded Jobs: 14 15 16

Show the Stage ID and Task ID that corresponds to the max metric

digraph G { subgraph cluster0 { isCluster="true"; label="WholeStageCodegen (4)\n \nduration: 1 ms"; 1 [labelType="html" label="HashAggregate time in aggregation build: 1 ms number of output rows: 1"]; } 2 [labelType="html" label="Exchange shuffle records written: 3 shuffle write time total (min, med, max (stageId: taskId)) 2 ms (0 ms, 0 ms, 1 ms (stage 17.0: task 21)) records read: 3 local bytes read: 118.0 B fetch wait time: 0 ms remote bytes read: 59.0 B local blocks read: 2 remote blocks read: 1 data size total (min, med, max (stageId: taskId)) 48.0 B (16.0 B, 16.0 B, 16.0 B (stage 17.0: task 22)) shuffle bytes written total (min, med, max (stageId: taskId)) 177.0 B (59.0 B, 59.0 B, 59.0 B (stage 17.0: task 22))"]; subgraph cluster3 { isCluster="true"; label="WholeStageCodegen (3)\n \nduration: total (min, med, max (stageId: taskId))\n2.3 s (232 ms, 889 ms, 1.2 s (stage 17.0: task 21))"; 4 [labelType="html" label="HashAggregate time in aggregation build total (min, med, max (stageId: taskId)) 2.1 s (231 ms, 820 ms, 1.1 s (stage 17.0: task 21)) number of output rows: 3"]; 5 [labelType="html" label=" Project "]; 6 [labelType="html" label="BroadcastHashJoin number of output rows: 285,732"]; 7 [labelType="html" label="BroadcastHashJoin number of output rows: 466,523"]; 8 [labelType="html" label=" Project "]; } 9 [labelType="html" label="Scan csv number of files read: 7 metadata time: 0 ms size of files read: 47.0 MiB number of output rows: 466,523"]; 10 [labelType="html" label="BroadcastExchange data size: 1026.1 KiB time to collect: 178 ms time to build: 8 ms time to broadcast: 3 ms"]; subgraph cluster11 { isCluster="true"; label="WholeStageCodegen (1)\n \nduration: 76 ms"; 12 [labelType="html" label=" Project "]; } 13 [labelType="html" label="Scan csv number of files read: 1 metadata time: 0 ms size of files read: 10.5 KiB number of output rows: 265"]; 14 [labelType="html" label="BroadcastExchange data size: 1040.0 KiB time to collect: 165 ms time to build: 14 ms time to broadcast: 5 ms"]; subgraph cluster15 { isCluster="true"; label="WholeStageCodegen (2)\n \nduration: 60 ms"; 16 [labelType="html" label=" Project "]; } 17 [labelType="html" label="Scan csv number of files read: 1 metadata time: 0 ms size of files read: 10.5 KiB number of output rows: 265"]; 2->1; 4->2; 5->4; 6->5; 7->6; 8->7; 9->8; 10->7; 12->10; 13->12; 14->6; 16->14; 17->16; }

HashAggregate(keys=[], functions=[count(1)])

WholeStageCodegen (4)

Exchange SinglePartition, true, [id=#293]

HashAggregate(keys=[], functions=[partial_count(1)])

Project

BroadcastHashJoin [src#123.Borough, src#123.service_zone, edge#121.dst], [dst#125.Borough, dst#125.service_zone, dst#125.id], Inner, BuildRight

BroadcastHashJoin [edge#121.src], [src#123.id], Inner, BuildRight

Project [struct(src, PULocationID#18, dst, DOLocationID#19) AS edge#121]

WholeStageCodegen (3)

FileScan csv [PULocationID#18,DOLocationID#19] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/green_tripdata/2023/green_tripdata_20..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<PULocationID:int,DOLocationID:int>

BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].id as bigint))), [id=#281]

Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS src#123]

WholeStageCodegen (1)

FileScan csv [LocationID#72,Borough#73,Zone#74,service_zone#75] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/taxi_zone_lookup.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<LocationID:int,Borough:string,Zone:string,service_zone:string>

BroadcastExchange HashedRelationBroadcastMode(List(input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].Borough, input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].service_zone, input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].id)), [id=#287]

Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS dst#125]

WholeStageCodegen (2)

Details

== Parsed Logical Plan ==
Aggregate [count(1) AS count#243L]
+- Project [src#123.id AS src.id#199, dst#125.id AS dst.id#200, src#123.Borough AS Borough#201, src#123.service_zone AS service_zone#202]
   +- Filter ((src#123.Borough = dst#125.Borough) AND (src#123.service_zone = dst#125.service_zone))
      +- Project [src#123, edge#121, dst#125]
         +- Join Inner, (edge#121.dst = dst#125.id)
            :- Join Inner, (edge#121.src = src#123.id)
            :  :- Project [struct(src, src#85, dst, dst#86) AS edge#121]
            :  :  +- Project [PULocationID#18 AS src#85, DOLocationID#19 AS dst#86]
            :  :     +- Relation[lpep_pickup_datetime#16,lpep_dropoff_datetime#17,PULocationID#18,DOLocationID#19,passenger_count#20,trip_distance#21,fare_amount#22,extra#23,mta_tax#24,tip_amount#25,tolls_amount#26,ehail_fee#27,total_amount#28,payment_type#29,trip_type#30,congestion_surcharge#31,taxi_type#32] csv
            :  +- Project [struct(id, id#80, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS src#123]
            :     +- Project [LocationID#72 AS id#80, Borough#73, Zone#74, service_zone#75]
            :        +- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv
            +- Project [struct(id, id#80, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS dst#125]
               +- Project [LocationID#72 AS id#80, Borough#73, Zone#74, service_zone#75]
                  +- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv

== Analyzed Logical Plan ==
count: bigint
Aggregate [count(1) AS count#243L]
+- Project [src#123.id AS src.id#199, dst#125.id AS dst.id#200, src#123.Borough AS Borough#201, src#123.service_zone AS service_zone#202]
   +- Filter ((src#123.Borough = dst#125.Borough) AND (src#123.service_zone = dst#125.service_zone))
      +- Project [src#123, edge#121, dst#125]
         +- Join Inner, (edge#121.dst = dst#125.id)
            :- Join Inner, (edge#121.src = src#123.id)
            :  :- Project [struct(src, src#85, dst, dst#86) AS edge#121]
            :  :  +- Project [PULocationID#18 AS src#85, DOLocationID#19 AS dst#86]
            :  :     +- Relation[lpep_pickup_datetime#16,lpep_dropoff_datetime#17,PULocationID#18,DOLocationID#19,passenger_count#20,trip_distance#21,fare_amount#22,extra#23,mta_tax#24,tip_amount#25,tolls_amount#26,ehail_fee#27,total_amount#28,payment_type#29,trip_type#30,congestion_surcharge#31,taxi_type#32] csv
            :  +- Project [struct(id, id#80, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS src#123]
            :     +- Project [LocationID#72 AS id#80, Borough#73, Zone#74, service_zone#75]
            :        +- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv
            +- Project [struct(id, id#80, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS dst#125]
               +- Project [LocationID#72 AS id#80, Borough#73, Zone#74, service_zone#75]
                  +- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv

== Optimized Logical Plan ==
Aggregate [count(1) AS count#243L]
+- Project
   +- Join Inner, (((src#123.Borough = dst#125.Borough) AND (src#123.service_zone = dst#125.service_zone)) AND (edge#121.dst = dst#125.id))
      :- Join Inner, (edge#121.src = src#123.id)
      :  :- Project [struct(src, PULocationID#18, dst, DOLocationID#19) AS edge#121]
      :  :  +- Relation[lpep_pickup_datetime#16,lpep_dropoff_datetime#17,PULocationID#18,DOLocationID#19,passenger_count#20,trip_distance#21,fare_amount#22,extra#23,mta_tax#24,tip_amount#25,tolls_amount#26,ehail_fee#27,total_amount#28,payment_type#29,trip_type#30,congestion_surcharge#31,taxi_type#32] csv
      :  +- Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS src#123]
      :     +- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv
      +- Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS dst#125]
         +- Relation[LocationID#72,Borough#73,Zone#74,service_zone#75] csv

== Physical Plan ==
*(4) HashAggregate(keys=[], functions=[count(1)], output=[count#243L])
+- Exchange SinglePartition, true, [id=#293]
   +- *(3) HashAggregate(keys=[], functions=[partial_count(1)], output=[count#246L])
      +- *(3) Project
         +- *(3) BroadcastHashJoin [src#123.Borough, src#123.service_zone, edge#121.dst], [dst#125.Borough, dst#125.service_zone, dst#125.id], Inner, BuildRight
            :- *(3) BroadcastHashJoin [edge#121.src], [src#123.id], Inner, BuildRight
            :  :- *(3) Project [struct(src, PULocationID#18, dst, DOLocationID#19) AS edge#121]
            :  :  +- FileScan csv [PULocationID#18,DOLocationID#19] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/green_tripdata/2023/green_tripdata_20..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<PULocationID:int,DOLocationID:int>
            :  +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].id as bigint))), [id=#281]
            :     +- *(1) Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS src#123]
            :        +- FileScan csv [LocationID#72,Borough#73,Zone#74,service_zone#75] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/taxi_zone_lookup.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<LocationID:int,Borough:string,Zone:string,service_zone:string>
            +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].Borough, input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].service_zone, input[0, struct<id:int,Borough:string,Zone:string,service_zone:string>, false].id)), [id=#287]
               +- *(2) Project [struct(id, LocationID#72, Borough, Borough#73, Zone, Zone#74, service_zone, service_zone#75) AS dst#125]
                  +- FileScan csv [LocationID#72,Borough#73,Zone#74,service_zone#75] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[s3a://data-repository-bkt/ECS765/nyc_taxi/taxi_zone_lookup.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<LocationID:int,Borough:string,Zone:string,service_zone:string>