diff --git a/python/pyspark/sql/connect/proto/relations_pb2.py b/python/pyspark/sql/connect/proto/relations_pb2.py index 3774bcbdbfb0e..9f008b756de22 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.py +++ b/python/pyspark/sql/connect/proto/relations_pb2.py @@ -43,7 +43,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto\x1a\x1aspark/connect/common.proto\x1a\x1dspark/connect/ml_common.proto"\x9c\x1d\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\x32\x13.spark.connect.JoinH\x00R\x04join\x12\x34\n\x06set_op\x18\x06 \x01(\x0b\x32\x1b.spark.connect.SetOperationH\x00R\x05setOp\x12)\n\x04sort\x18\x07 \x01(\x0b\x32\x13.spark.connect.SortH\x00R\x04sort\x12,\n\x05limit\x18\x08 \x01(\x0b\x32\x14.spark.connect.LimitH\x00R\x05limit\x12\x38\n\taggregate\x18\t \x01(\x0b\x32\x18.spark.connect.AggregateH\x00R\taggregate\x12&\n\x03sql\x18\n \x01(\x0b\x32\x12.spark.connect.SQLH\x00R\x03sql\x12\x45\n\x0elocal_relation\x18\x0b \x01(\x0b\x32\x1c.spark.connect.LocalRelationH\x00R\rlocalRelation\x12/\n\x06sample\x18\x0c \x01(\x0b\x32\x15.spark.connect.SampleH\x00R\x06sample\x12/\n\x06offset\x18\r \x01(\x0b\x32\x15.spark.connect.OffsetH\x00R\x06offset\x12>\n\x0b\x64\x65\x64uplicate\x18\x0e \x01(\x0b\x32\x1a.spark.connect.DeduplicateH\x00R\x0b\x64\x65\x64uplicate\x12,\n\x05range\x18\x0f \x01(\x0b\x32\x14.spark.connect.RangeH\x00R\x05range\x12\x45\n\x0esubquery_alias\x18\x10 \x01(\x0b\x32\x1c.spark.connect.SubqueryAliasH\x00R\rsubqueryAlias\x12>\n\x0brepartition\x18\x11 \x01(\x0b\x32\x1a.spark.connect.RepartitionH\x00R\x0brepartition\x12*\n\x05to_df\x18\x12 \x01(\x0b\x32\x13.spark.connect.ToDFH\x00R\x04toDf\x12U\n\x14with_columns_renamed\x18\x13 \x01(\x0b\x32!.spark.connect.WithColumnsRenamedH\x00R\x12withColumnsRenamed\x12<\n\x0bshow_string\x18\x14 \x01(\x0b\x32\x19.spark.connect.ShowStringH\x00R\nshowString\x12)\n\x04\x64rop\x18\x15 \x01(\x0b\x32\x13.spark.connect.DropH\x00R\x04\x64rop\x12)\n\x04tail\x18\x16 \x01(\x0b\x32\x13.spark.connect.TailH\x00R\x04tail\x12?\n\x0cwith_columns\x18\x17 \x01(\x0b\x32\x1a.spark.connect.WithColumnsH\x00R\x0bwithColumns\x12)\n\x04hint\x18\x18 \x01(\x0b\x32\x13.spark.connect.HintH\x00R\x04hint\x12\x32\n\x07unpivot\x18\x19 \x01(\x0b\x32\x16.spark.connect.UnpivotH\x00R\x07unpivot\x12\x36\n\tto_schema\x18\x1a \x01(\x0b\x32\x17.spark.connect.ToSchemaH\x00R\x08toSchema\x12\x64\n\x19repartition_by_expression\x18\x1b \x01(\x0b\x32&.spark.connect.RepartitionByExpressionH\x00R\x17repartitionByExpression\x12\x45\n\x0emap_partitions\x18\x1c \x01(\x0b\x32\x1c.spark.connect.MapPartitionsH\x00R\rmapPartitions\x12H\n\x0f\x63ollect_metrics\x18\x1d \x01(\x0b\x32\x1d.spark.connect.CollectMetricsH\x00R\x0e\x63ollectMetrics\x12,\n\x05parse\x18\x1e \x01(\x0b\x32\x14.spark.connect.ParseH\x00R\x05parse\x12\x36\n\tgroup_map\x18\x1f \x01(\x0b\x32\x17.spark.connect.GroupMapH\x00R\x08groupMap\x12=\n\x0c\x63o_group_map\x18 \x01(\x0b\x32\x19.spark.connect.CoGroupMapH\x00R\ncoGroupMap\x12\x45\n\x0ewith_watermark\x18! 
\x01(\x0b\x32\x1c.spark.connect.WithWatermarkH\x00R\rwithWatermark\x12\x63\n\x1a\x61pply_in_pandas_with_state\x18" \x01(\x0b\x32%.spark.connect.ApplyInPandasWithStateH\x00R\x16\x61pplyInPandasWithState\x12<\n\x0bhtml_string\x18# \x01(\x0b\x32\x19.spark.connect.HtmlStringH\x00R\nhtmlString\x12X\n\x15\x63\x61\x63hed_local_relation\x18$ \x01(\x0b\x32".spark.connect.CachedLocalRelationH\x00R\x13\x63\x61\x63hedLocalRelation\x12[\n\x16\x63\x61\x63hed_remote_relation\x18% \x01(\x0b\x32#.spark.connect.CachedRemoteRelationH\x00R\x14\x63\x61\x63hedRemoteRelation\x12\x8e\x01\n)common_inline_user_defined_table_function\x18& \x01(\x0b\x32\x33.spark.connect.CommonInlineUserDefinedTableFunctionH\x00R$commonInlineUserDefinedTableFunction\x12\x37\n\nas_of_join\x18\' \x01(\x0b\x32\x17.spark.connect.AsOfJoinH\x00R\x08\x61sOfJoin\x12\x85\x01\n&common_inline_user_defined_data_source\x18( \x01(\x0b\x32\x30.spark.connect.CommonInlineUserDefinedDataSourceH\x00R!commonInlineUserDefinedDataSource\x12\x45\n\x0ewith_relations\x18) \x01(\x0b\x32\x1c.spark.connect.WithRelationsH\x00R\rwithRelations\x12\x38\n\ttranspose\x18* \x01(\x0b\x32\x18.spark.connect.TransposeH\x00R\ttranspose\x12w\n unresolved_table_valued_function\x18+ \x01(\x0b\x32,.spark.connect.UnresolvedTableValuedFunctionH\x00R\x1dunresolvedTableValuedFunction\x12?\n\x0clateral_join\x18, \x01(\x0b\x32\x1a.spark.connect.LateralJoinH\x00R\x0blateralJoin\x12\x30\n\x07\x66ill_na\x18Z \x01(\x0b\x32\x15.spark.connect.NAFillH\x00R\x06\x66illNa\x12\x30\n\x07\x64rop_na\x18[ \x01(\x0b\x32\x15.spark.connect.NADropH\x00R\x06\x64ropNa\x12\x34\n\x07replace\x18\\ \x01(\x0b\x32\x18.spark.connect.NAReplaceH\x00R\x07replace\x12\x36\n\x07summary\x18\x64 \x01(\x0b\x32\x1a.spark.connect.StatSummaryH\x00R\x07summary\x12\x39\n\x08\x63rosstab\x18\x65 \x01(\x0b\x32\x1b.spark.connect.StatCrosstabH\x00R\x08\x63rosstab\x12\x39\n\x08\x64\x65scribe\x18\x66 \x01(\x0b\x32\x1b.spark.connect.StatDescribeH\x00R\x08\x64\x65scribe\x12*\n\x03\x63ov\x18g \x01(\x0b\x32\x16.spark.connect.StatCovH\x00R\x03\x63ov\x12-\n\x04\x63orr\x18h \x01(\x0b\x32\x17.spark.connect.StatCorrH\x00R\x04\x63orr\x12L\n\x0f\x61pprox_quantile\x18i \x01(\x0b\x32!.spark.connect.StatApproxQuantileH\x00R\x0e\x61pproxQuantile\x12=\n\nfreq_items\x18j \x01(\x0b\x32\x1c.spark.connect.StatFreqItemsH\x00R\tfreqItems\x12:\n\tsample_by\x18k \x01(\x0b\x32\x1b.spark.connect.StatSampleByH\x00R\x08sampleBy\x12\x33\n\x07\x63\x61talog\x18\xc8\x01 \x01(\x0b\x32\x16.spark.connect.CatalogH\x00R\x07\x63\x61talog\x12=\n\x0bml_relation\x18\xac\x02 \x01(\x0b\x32\x19.spark.connect.MlRelationH\x00R\nmlRelation\x12\x35\n\textension\x18\xe6\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x33\n\x07unknown\x18\xe7\x07 \x01(\x0b\x32\x16.spark.connect.UnknownH\x00R\x07unknownB\n\n\x08rel_type"\xe4\x03\n\nMlRelation\x12\x43\n\ttransform\x18\x01 \x01(\x0b\x32#.spark.connect.MlRelation.TransformH\x00R\ttransform\x12,\n\x05\x66\x65tch\x18\x02 \x01(\x0b\x32\x14.spark.connect.FetchH\x00R\x05\x66\x65tch\x12P\n\x15model_summary_dataset\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationH\x01R\x13modelSummaryDataset\x88\x01\x01\x1a\xeb\x01\n\tTransform\x12\x33\n\x07obj_ref\x18\x01 \x01(\x0b\x32\x18.spark.connect.ObjectRefH\x00R\x06objRef\x12=\n\x0btransformer\x18\x02 \x01(\x0b\x32\x19.spark.connect.MlOperatorH\x00R\x0btransformer\x12-\n\x05input\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12/\n\x06params\x18\x04 
\x01(\x0b\x32\x17.spark.connect.MlParamsR\x06paramsB\n\n\x08operatorB\t\n\x07ml_typeB\x18\n\x16_model_summary_dataset"\xcb\x02\n\x05\x46\x65tch\x12\x31\n\x07obj_ref\x18\x01 \x01(\x0b\x32\x18.spark.connect.ObjectRefR\x06objRef\x12\x35\n\x07methods\x18\x02 \x03(\x0b\x32\x1b.spark.connect.Fetch.MethodR\x07methods\x1a\xd7\x01\n\x06Method\x12\x16\n\x06method\x18\x01 \x01(\tR\x06method\x12\x34\n\x04\x61rgs\x18\x02 \x03(\x0b\x32 .spark.connect.Fetch.Method.ArgsR\x04\x61rgs\x1a\x7f\n\x04\x41rgs\x12\x39\n\x05param\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralH\x00R\x05param\x12/\n\x05input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationH\x00R\x05inputB\x0b\n\targs_type"\t\n\x07Unknown"\x8e\x01\n\x0eRelationCommon\x12#\n\x0bsource_info\x18\x01 \x01(\tB\x02\x18\x01R\nsourceInfo\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x12-\n\x06origin\x18\x03 \x01(\x0b\x32\x15.spark.connect.OriginR\x06originB\n\n\x08_plan_id"\xde\x03\n\x03SQL\x12\x14\n\x05query\x18\x01 \x01(\tR\x05query\x12\x34\n\x04\x61rgs\x18\x02 \x03(\x0b\x32\x1c.spark.connect.SQL.ArgsEntryB\x02\x18\x01R\x04\x61rgs\x12@\n\x08pos_args\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralB\x02\x18\x01R\x07posArgs\x12O\n\x0fnamed_arguments\x18\x04 \x03(\x0b\x32&.spark.connect.SQL.NamedArgumentsEntryR\x0enamedArguments\x12>\n\rpos_arguments\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cposArguments\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01\x1a\\\n\x13NamedArgumentsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value:\x02\x38\x01"u\n\rWithRelations\x12+\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04root\x12\x37\n\nreferences\x18\x02 \x03(\x0b\x32\x17.spark.connect.RelationR\nreferences"\x97\x05\n\x04Read\x12\x41\n\x0bnamed_table\x18\x01 \x01(\x0b\x32\x1e.spark.connect.Read.NamedTableH\x00R\nnamedTable\x12\x41\n\x0b\x64\x61ta_source\x18\x02 \x01(\x0b\x32\x1e.spark.connect.Read.DataSourceH\x00R\ndataSource\x12!\n\x0cis_streaming\x18\x03 \x01(\x08R\x0bisStreaming\x1a\xc0\x01\n\nNamedTable\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x45\n\x07options\x18\x02 \x03(\x0b\x32+.spark.connect.Read.NamedTable.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x95\x02\n\nDataSource\x12\x1b\n\x06\x66ormat\x18\x01 \x01(\tH\x00R\x06\x66ormat\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x12\x45\n\x07options\x18\x03 \x03(\x0b\x32+.spark.connect.Read.DataSource.OptionsEntryR\x07options\x12\x14\n\x05paths\x18\x04 \x03(\tR\x05paths\x12\x1e\n\npredicates\x18\x05 \x03(\tR\npredicates\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07_formatB\t\n\x07_schemaB\x0b\n\tread_type"u\n\x07Project\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12;\n\x0b\x65xpressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0b\x65xpressions"p\n\x06\x46ilter\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x37\n\tcondition\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\tcondition"\x95\x05\n\x04Join\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 \x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinType\x12#\n\rusing_columns\x18\x05 \x03(\tR\x0cusingColumns\x12K\n\x0ejoin_data_type\x18\x06 \x01(\x0b\x32 .spark.connect.Join.JoinDataTypeH\x00R\x0cjoinDataType\x88\x01\x01\x1a\\\n\x0cJoinDataType\x12$\n\x0eis_left_struct\x18\x01 \x01(\x08R\x0cisLeftStruct\x12&\n\x0fis_right_struct\x18\x02 \x01(\x08R\risRightStruct"\xd0\x01\n\x08JoinType\x12\x19\n\x15JOIN_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOIN_TYPE_INNER\x10\x01\x12\x18\n\x14JOIN_TYPE_FULL_OUTER\x10\x02\x12\x18\n\x14JOIN_TYPE_LEFT_OUTER\x10\x03\x12\x19\n\x15JOIN_TYPE_RIGHT_OUTER\x10\x04\x12\x17\n\x13JOIN_TYPE_LEFT_ANTI\x10\x05\x12\x17\n\x13JOIN_TYPE_LEFT_SEMI\x10\x06\x12\x13\n\x0fJOIN_TYPE_CROSS\x10\x07\x42\x11\n\x0f_join_data_type"\xdf\x03\n\x0cSetOperation\x12\x36\n\nleft_input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\tleftInput\x12\x38\n\x0bright_input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\nrightInput\x12\x45\n\x0bset_op_type\x18\x03 \x01(\x0e\x32%.spark.connect.SetOperation.SetOpTypeR\tsetOpType\x12\x1a\n\x06is_all\x18\x04 \x01(\x08H\x00R\x05isAll\x88\x01\x01\x12\x1c\n\x07\x62y_name\x18\x05 \x01(\x08H\x01R\x06\x62yName\x88\x01\x01\x12\x37\n\x15\x61llow_missing_columns\x18\x06 \x01(\x08H\x02R\x13\x61llowMissingColumns\x88\x01\x01"r\n\tSetOpType\x12\x1b\n\x17SET_OP_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15SET_OP_TYPE_INTERSECT\x10\x01\x12\x15\n\x11SET_OP_TYPE_UNION\x10\x02\x12\x16\n\x12SET_OP_TYPE_EXCEPT\x10\x03\x42\t\n\x07_is_allB\n\n\x08_by_nameB\x18\n\x16_allow_missing_columns"L\n\x05Limit\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"O\n\x06Offset\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06offset\x18\x02 \x01(\x05R\x06offset"K\n\x04Tail\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"\xfe\x05\n\tAggregate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x41\n\ngroup_type\x18\x02 \x01(\x0e\x32".spark.connect.Aggregate.GroupTypeR\tgroupType\x12L\n\x14grouping_expressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12N\n\x15\x61ggregate_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x14\x61ggregateExpressions\x12\x34\n\x05pivot\x18\x05 \x01(\x0b\x32\x1e.spark.connect.Aggregate.PivotR\x05pivot\x12J\n\rgrouping_sets\x18\x06 \x03(\x0b\x32%.spark.connect.Aggregate.GroupingSetsR\x0cgroupingSets\x1ao\n\x05Pivot\x12+\n\x03\x63ol\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1aL\n\x0cGroupingSets\x12<\n\x0cgrouping_set\x18\x01 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0bgroupingSet"\x9f\x01\n\tGroupType\x12\x1a\n\x16GROUP_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12GROUP_TYPE_GROUPBY\x10\x01\x12\x15\n\x11GROUP_TYPE_ROLLUP\x10\x02\x12\x13\n\x0fGROUP_TYPE_CUBE\x10\x03\x12\x14\n\x10GROUP_TYPE_PIVOT\x10\x04\x12\x1c\n\x18GROUP_TYPE_GROUPING_SETS\x10\x05"\xa0\x01\n\x04Sort\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x05order\x18\x02 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\x05order\x12 \n\tis_global\x18\x03 
\x01(\x08H\x00R\x08isGlobal\x88\x01\x01\x42\x0c\n\n_is_global"\x8d\x01\n\x04\x44rop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x33\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07\x63olumns\x12!\n\x0c\x63olumn_names\x18\x03 \x03(\tR\x0b\x63olumnNames"\xf0\x01\n\x0b\x44\x65\x64uplicate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames\x12\x32\n\x13\x61ll_columns_as_keys\x18\x03 \x01(\x08H\x00R\x10\x61llColumnsAsKeys\x88\x01\x01\x12.\n\x10within_watermark\x18\x04 \x01(\x08H\x01R\x0fwithinWatermark\x88\x01\x01\x42\x16\n\x14_all_columns_as_keysB\x13\n\x11_within_watermark"Y\n\rLocalRelation\x12\x17\n\x04\x64\x61ta\x18\x01 \x01(\x0cH\x00R\x04\x64\x61ta\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x42\x07\n\x05_dataB\t\n\x07_schema"H\n\x13\x43\x61\x63hedLocalRelation\x12\x12\n\x04hash\x18\x03 \x01(\tR\x04hashJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03R\x06userIdR\tsessionId"7\n\x14\x43\x61\x63hedRemoteRelation\x12\x1f\n\x0brelation_id\x18\x01 \x01(\tR\nrelationId"\x91\x02\n\x06Sample\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1f\n\x0blower_bound\x18\x02 \x01(\x01R\nlowerBound\x12\x1f\n\x0bupper_bound\x18\x03 \x01(\x01R\nupperBound\x12.\n\x10with_replacement\x18\x04 \x01(\x08H\x00R\x0fwithReplacement\x88\x01\x01\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x01R\x04seed\x88\x01\x01\x12/\n\x13\x64\x65terministic_order\x18\x06 \x01(\x08R\x12\x64\x65terministicOrderB\x13\n\x11_with_replacementB\x07\n\x05_seed"\x91\x01\n\x05Range\x12\x19\n\x05start\x18\x01 \x01(\x03H\x00R\x05start\x88\x01\x01\x12\x10\n\x03\x65nd\x18\x02 \x01(\x03R\x03\x65nd\x12\x12\n\x04step\x18\x03 \x01(\x03R\x04step\x12*\n\x0enum_partitions\x18\x04 \x01(\x05H\x01R\rnumPartitions\x88\x01\x01\x42\x08\n\x06_startB\x11\n\x0f_num_partitions"r\n\rSubqueryAlias\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05\x61lias\x18\x02 \x01(\tR\x05\x61lias\x12\x1c\n\tqualifier\x18\x03 \x03(\tR\tqualifier"\x8e\x01\n\x0bRepartition\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12%\n\x0enum_partitions\x18\x02 \x01(\x05R\rnumPartitions\x12\x1d\n\x07shuffle\x18\x03 \x01(\x08H\x00R\x07shuffle\x88\x01\x01\x42\n\n\x08_shuffle"\x8e\x01\n\nShowString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate\x12\x1a\n\x08vertical\x18\x04 \x01(\x08R\x08vertical"r\n\nHtmlString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate"\\\n\x0bStatSummary\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1e\n\nstatistics\x18\x02 \x03(\tR\nstatistics"Q\n\x0cStatDescribe\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols"e\n\x0cStatCrosstab\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"`\n\x07StatCov\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"\x89\x01\n\x08StatCorr\x12-\n\x05input\x18\x01 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2\x12\x1b\n\x06method\x18\x04 \x01(\tH\x00R\x06method\x88\x01\x01\x42\t\n\x07_method"\xa4\x01\n\x12StatApproxQuantile\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12$\n\rprobabilities\x18\x03 \x03(\x01R\rprobabilities\x12%\n\x0erelative_error\x18\x04 \x01(\x01R\rrelativeError"}\n\rStatFreqItems\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x1d\n\x07support\x18\x03 \x01(\x01H\x00R\x07support\x88\x01\x01\x42\n\n\x08_support"\xb5\x02\n\x0cStatSampleBy\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03\x63ol\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x42\n\tfractions\x18\x03 \x03(\x0b\x32$.spark.connect.StatSampleBy.FractionR\tfractions\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x00R\x04seed\x88\x01\x01\x1a\x63\n\x08\x46raction\x12;\n\x07stratum\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x07stratum\x12\x1a\n\x08\x66raction\x18\x02 \x01(\x01R\x08\x66ractionB\x07\n\x05_seed"\x86\x01\n\x06NAFill\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x39\n\x06values\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values"\x86\x01\n\x06NADrop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\'\n\rmin_non_nulls\x18\x03 \x01(\x05H\x00R\x0bminNonNulls\x88\x01\x01\x42\x10\n\x0e_min_non_nulls"\xa8\x02\n\tNAReplace\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12H\n\x0creplacements\x18\x03 \x03(\x0b\x32$.spark.connect.NAReplace.ReplacementR\x0creplacements\x1a\x8d\x01\n\x0bReplacement\x12>\n\told_value\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08oldValue\x12>\n\tnew_value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08newValue"X\n\x04ToDF\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames"\xfe\x02\n\x12WithColumnsRenamed\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12i\n\x12rename_columns_map\x18\x02 \x03(\x0b\x32\x37.spark.connect.WithColumnsRenamed.RenameColumnsMapEntryB\x02\x18\x01R\x10renameColumnsMap\x12\x42\n\x07renames\x18\x03 \x03(\x0b\x32(.spark.connect.WithColumnsRenamed.RenameR\x07renames\x1a\x43\n\x15RenameColumnsMapEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x45\n\x06Rename\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12 \n\x0cnew_col_name\x18\x02 \x01(\tR\nnewColName"w\n\x0bWithColumns\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x07\x61liases\x18\x02 \x03(\x0b\x32\x1f.spark.connect.Expression.AliasR\x07\x61liases"\x86\x01\n\rWithWatermark\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\nevent_time\x18\x02 \x01(\tR\teventTime\x12\'\n\x0f\x64\x65lay_threshold\x18\x03 \x01(\tR\x0e\x64\x65layThreshold"\x84\x01\n\x04Hint\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x39\n\nparameters\x18\x03 
\x03(\x0b\x32\x19.spark.connect.ExpressionR\nparameters"\xc7\x02\n\x07Unpivot\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03ids\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x03ids\x12:\n\x06values\x18\x03 \x01(\x0b\x32\x1d.spark.connect.Unpivot.ValuesH\x00R\x06values\x88\x01\x01\x12\x30\n\x14variable_column_name\x18\x04 \x01(\tR\x12variableColumnName\x12*\n\x11value_column_name\x18\x05 \x01(\tR\x0fvalueColumnName\x1a;\n\x06Values\x12\x31\n\x06values\x18\x01 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x06valuesB\t\n\x07_values"z\n\tTranspose\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12>\n\rindex_columns\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cindexColumns"}\n\x1dUnresolvedTableValuedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments"j\n\x08ToSchema\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12/\n\x06schema\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema"\xcb\x01\n\x17RepartitionByExpression\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x0fpartition_exprs\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0epartitionExprs\x12*\n\x0enum_partitions\x18\x03 \x01(\x05H\x00R\rnumPartitions\x88\x01\x01\x42\x11\n\x0f_num_partitions"\xe8\x01\n\rMapPartitions\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x04\x66unc\x18\x02 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12"\n\nis_barrier\x18\x03 \x01(\x08H\x00R\tisBarrier\x88\x01\x01\x12"\n\nprofile_id\x18\x04 \x01(\x05H\x01R\tprofileId\x88\x01\x01\x42\r\n\x0b_is_barrierB\r\n\x0b_profile_id"\xd2\x06\n\x08GroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12J\n\x13sorting_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x12sortingExpressions\x12<\n\rinitial_input\x18\x05 \x01(\x0b\x32\x17.spark.connect.RelationR\x0cinitialInput\x12[\n\x1cinitial_grouping_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x1ainitialGroupingExpressions\x12;\n\x18is_map_groups_with_state\x18\x07 \x01(\x08H\x00R\x14isMapGroupsWithState\x88\x01\x01\x12$\n\x0boutput_mode\x18\x08 \x01(\tH\x01R\noutputMode\x88\x01\x01\x12&\n\x0ctimeout_conf\x18\t \x01(\tH\x02R\x0btimeoutConf\x88\x01\x01\x12?\n\x0cstate_schema\x18\n \x01(\x0b\x32\x17.spark.connect.DataTypeH\x03R\x0bstateSchema\x88\x01\x01\x12\x65\n\x19transform_with_state_info\x18\x0b \x01(\x0b\x32%.spark.connect.TransformWithStateInfoH\x04R\x16transformWithStateInfo\x88\x01\x01\x42\x1b\n\x19_is_map_groups_with_stateB\x0e\n\x0c_output_modeB\x0f\n\r_timeout_confB\x0f\n\r_state_schemaB\x1c\n\x1a_transform_with_state_info"\xdf\x01\n\x16TransformWithStateInfo\x12\x1b\n\ttime_mode\x18\x01 \x01(\tR\x08timeMode\x12\x38\n\x16\x65vent_time_column_name\x18\x02 \x01(\tH\x00R\x13\x65ventTimeColumnName\x88\x01\x01\x12\x41\n\routput_schema\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x01R\x0coutputSchema\x88\x01\x01\x42\x19\n\x17_event_time_column_nameB\x10\n\x0e_output_schema"\x8e\x04\n\nCoGroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12W\n\x1ainput_grouping_expressions\x18\x02 
\x03(\x0b\x32\x19.spark.connect.ExpressionR\x18inputGroupingExpressions\x12-\n\x05other\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationR\x05other\x12W\n\x1aother_grouping_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18otherGroupingExpressions\x12\x42\n\x04\x66unc\x18\x05 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12U\n\x19input_sorting_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17inputSortingExpressions\x12U\n\x19other_sorting_expressions\x18\x07 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17otherSortingExpressions"\xe5\x02\n\x16\x41pplyInPandasWithState\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12#\n\routput_schema\x18\x04 \x01(\tR\x0coutputSchema\x12!\n\x0cstate_schema\x18\x05 \x01(\tR\x0bstateSchema\x12\x1f\n\x0boutput_mode\x18\x06 \x01(\tR\noutputMode\x12!\n\x0ctimeout_conf\x18\x07 \x01(\tR\x0btimeoutConf"\xf4\x01\n$CommonInlineUserDefinedTableFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12<\n\x0bpython_udtf\x18\x04 \x01(\x0b\x32\x19.spark.connect.PythonUDTFH\x00R\npythonUdtfB\n\n\x08\x66unction"\xb1\x01\n\nPythonUDTF\x12=\n\x0breturn_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\nreturnType\x88\x01\x01\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVerB\x0e\n\x0c_return_type"\x97\x01\n!CommonInlineUserDefinedDataSource\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12O\n\x12python_data_source\x18\x02 \x01(\x0b\x32\x1f.spark.connect.PythonDataSourceH\x00R\x10pythonDataSourceB\r\n\x0b\x64\x61ta_source"K\n\x10PythonDataSource\x12\x18\n\x07\x63ommand\x18\x01 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x02 \x01(\tR\tpythonVer"\x88\x01\n\x0e\x43ollectMetrics\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x33\n\x07metrics\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07metrics"\x84\x03\n\x05Parse\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x38\n\x06\x66ormat\x18\x02 \x01(\x0e\x32 .spark.connect.Parse.ParseFormatR\x06\x66ormat\x12\x34\n\x06schema\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x06schema\x88\x01\x01\x12;\n\x07options\x18\x04 \x03(\x0b\x32!.spark.connect.Parse.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"X\n\x0bParseFormat\x12\x1c\n\x18PARSE_FORMAT_UNSPECIFIED\x10\x00\x12\x14\n\x10PARSE_FORMAT_CSV\x10\x01\x12\x15\n\x11PARSE_FORMAT_JSON\x10\x02\x42\t\n\x07_schema"\xdb\x03\n\x08\x41sOfJoin\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12\x37\n\nleft_as_of\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08leftAsOf\x12\x39\n\x0bright_as_of\x18\x04 \x01(\x0b\x32\x19.spark.connect.ExpressionR\trightAsOf\x12\x36\n\tjoin_expr\x18\x05 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08joinExpr\x12#\n\rusing_columns\x18\x06 
\x03(\tR\x0cusingColumns\x12\x1b\n\tjoin_type\x18\x07 \x01(\tR\x08joinType\x12\x37\n\ttolerance\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\ttolerance\x12.\n\x13\x61llow_exact_matches\x18\t \x01(\x08R\x11\x61llowExactMatches\x12\x1c\n\tdirection\x18\n \x01(\tR\tdirection"\xe6\x01\n\x0bLateralJoin\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 \x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinTypeB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' + b'\n\x1dspark/connect/relations.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x19spark/connect/types.proto\x1a\x1bspark/connect/catalog.proto\x1a\x1aspark/connect/common.proto\x1a\x1dspark/connect/ml_common.proto"\xcc\x1d\n\x08Relation\x12\x35\n\x06\x63ommon\x18\x01 \x01(\x0b\x32\x1d.spark.connect.RelationCommonR\x06\x63ommon\x12)\n\x04read\x18\x02 \x01(\x0b\x32\x13.spark.connect.ReadH\x00R\x04read\x12\x32\n\x07project\x18\x03 \x01(\x0b\x32\x16.spark.connect.ProjectH\x00R\x07project\x12/\n\x06\x66ilter\x18\x04 \x01(\x0b\x32\x15.spark.connect.FilterH\x00R\x06\x66ilter\x12)\n\x04join\x18\x05 \x01(\x0b\x32\x13.spark.connect.JoinH\x00R\x04join\x12\x34\n\x06set_op\x18\x06 \x01(\x0b\x32\x1b.spark.connect.SetOperationH\x00R\x05setOp\x12)\n\x04sort\x18\x07 \x01(\x0b\x32\x13.spark.connect.SortH\x00R\x04sort\x12,\n\x05limit\x18\x08 \x01(\x0b\x32\x14.spark.connect.LimitH\x00R\x05limit\x12\x38\n\taggregate\x18\t \x01(\x0b\x32\x18.spark.connect.AggregateH\x00R\taggregate\x12&\n\x03sql\x18\n \x01(\x0b\x32\x12.spark.connect.SQLH\x00R\x03sql\x12\x45\n\x0elocal_relation\x18\x0b \x01(\x0b\x32\x1c.spark.connect.LocalRelationH\x00R\rlocalRelation\x12/\n\x06sample\x18\x0c \x01(\x0b\x32\x15.spark.connect.SampleH\x00R\x06sample\x12/\n\x06offset\x18\r \x01(\x0b\x32\x15.spark.connect.OffsetH\x00R\x06offset\x12>\n\x0b\x64\x65\x64uplicate\x18\x0e \x01(\x0b\x32\x1a.spark.connect.DeduplicateH\x00R\x0b\x64\x65\x64uplicate\x12,\n\x05range\x18\x0f \x01(\x0b\x32\x14.spark.connect.RangeH\x00R\x05range\x12\x45\n\x0esubquery_alias\x18\x10 \x01(\x0b\x32\x1c.spark.connect.SubqueryAliasH\x00R\rsubqueryAlias\x12>\n\x0brepartition\x18\x11 \x01(\x0b\x32\x1a.spark.connect.RepartitionH\x00R\x0brepartition\x12*\n\x05to_df\x18\x12 \x01(\x0b\x32\x13.spark.connect.ToDFH\x00R\x04toDf\x12U\n\x14with_columns_renamed\x18\x13 \x01(\x0b\x32!.spark.connect.WithColumnsRenamedH\x00R\x12withColumnsRenamed\x12<\n\x0bshow_string\x18\x14 \x01(\x0b\x32\x19.spark.connect.ShowStringH\x00R\nshowString\x12)\n\x04\x64rop\x18\x15 \x01(\x0b\x32\x13.spark.connect.DropH\x00R\x04\x64rop\x12)\n\x04tail\x18\x16 \x01(\x0b\x32\x13.spark.connect.TailH\x00R\x04tail\x12?\n\x0cwith_columns\x18\x17 \x01(\x0b\x32\x1a.spark.connect.WithColumnsH\x00R\x0bwithColumns\x12)\n\x04hint\x18\x18 \x01(\x0b\x32\x13.spark.connect.HintH\x00R\x04hint\x12\x32\n\x07unpivot\x18\x19 \x01(\x0b\x32\x16.spark.connect.UnpivotH\x00R\x07unpivot\x12\x36\n\tto_schema\x18\x1a \x01(\x0b\x32\x17.spark.connect.ToSchemaH\x00R\x08toSchema\x12\x64\n\x19repartition_by_expression\x18\x1b \x01(\x0b\x32&.spark.connect.RepartitionByExpressionH\x00R\x17repartitionByExpression\x12\x45\n\x0emap_partitions\x18\x1c \x01(\x0b\x32\x1c.spark.connect.MapPartitionsH\x00R\rmapPartitions\x12H\n\x0f\x63ollect_metrics\x18\x1d 
\x01(\x0b\x32\x1d.spark.connect.CollectMetricsH\x00R\x0e\x63ollectMetrics\x12,\n\x05parse\x18\x1e \x01(\x0b\x32\x14.spark.connect.ParseH\x00R\x05parse\x12\x36\n\tgroup_map\x18\x1f \x01(\x0b\x32\x17.spark.connect.GroupMapH\x00R\x08groupMap\x12=\n\x0c\x63o_group_map\x18 \x01(\x0b\x32\x19.spark.connect.CoGroupMapH\x00R\ncoGroupMap\x12\x45\n\x0ewith_watermark\x18! \x01(\x0b\x32\x1c.spark.connect.WithWatermarkH\x00R\rwithWatermark\x12\x63\n\x1a\x61pply_in_pandas_with_state\x18" \x01(\x0b\x32%.spark.connect.ApplyInPandasWithStateH\x00R\x16\x61pplyInPandasWithState\x12<\n\x0bhtml_string\x18# \x01(\x0b\x32\x19.spark.connect.HtmlStringH\x00R\nhtmlString\x12X\n\x15\x63\x61\x63hed_local_relation\x18$ \x01(\x0b\x32".spark.connect.CachedLocalRelationH\x00R\x13\x63\x61\x63hedLocalRelation\x12[\n\x16\x63\x61\x63hed_remote_relation\x18% \x01(\x0b\x32#.spark.connect.CachedRemoteRelationH\x00R\x14\x63\x61\x63hedRemoteRelation\x12\x8e\x01\n)common_inline_user_defined_table_function\x18& \x01(\x0b\x32\x33.spark.connect.CommonInlineUserDefinedTableFunctionH\x00R$commonInlineUserDefinedTableFunction\x12\x37\n\nas_of_join\x18\' \x01(\x0b\x32\x17.spark.connect.AsOfJoinH\x00R\x08\x61sOfJoin\x12\x85\x01\n&common_inline_user_defined_data_source\x18( \x01(\x0b\x32\x30.spark.connect.CommonInlineUserDefinedDataSourceH\x00R!commonInlineUserDefinedDataSource\x12\x45\n\x0ewith_relations\x18) \x01(\x0b\x32\x1c.spark.connect.WithRelationsH\x00R\rwithRelations\x12\x38\n\ttranspose\x18* \x01(\x0b\x32\x18.spark.connect.TransposeH\x00R\ttranspose\x12w\n unresolved_table_valued_function\x18+ \x01(\x0b\x32,.spark.connect.UnresolvedTableValuedFunctionH\x00R\x1dunresolvedTableValuedFunction\x12?\n\x0clateral_join\x18, \x01(\x0b\x32\x1a.spark.connect.LateralJoinH\x00R\x0blateralJoin\x12.\n\x12referenced_plan_id\x18- \x01(\x03H\x00R\x10referencedPlanId\x12\x30\n\x07\x66ill_na\x18Z \x01(\x0b\x32\x15.spark.connect.NAFillH\x00R\x06\x66illNa\x12\x30\n\x07\x64rop_na\x18[ \x01(\x0b\x32\x15.spark.connect.NADropH\x00R\x06\x64ropNa\x12\x34\n\x07replace\x18\\ \x01(\x0b\x32\x18.spark.connect.NAReplaceH\x00R\x07replace\x12\x36\n\x07summary\x18\x64 \x01(\x0b\x32\x1a.spark.connect.StatSummaryH\x00R\x07summary\x12\x39\n\x08\x63rosstab\x18\x65 \x01(\x0b\x32\x1b.spark.connect.StatCrosstabH\x00R\x08\x63rosstab\x12\x39\n\x08\x64\x65scribe\x18\x66 \x01(\x0b\x32\x1b.spark.connect.StatDescribeH\x00R\x08\x64\x65scribe\x12*\n\x03\x63ov\x18g \x01(\x0b\x32\x16.spark.connect.StatCovH\x00R\x03\x63ov\x12-\n\x04\x63orr\x18h \x01(\x0b\x32\x17.spark.connect.StatCorrH\x00R\x04\x63orr\x12L\n\x0f\x61pprox_quantile\x18i \x01(\x0b\x32!.spark.connect.StatApproxQuantileH\x00R\x0e\x61pproxQuantile\x12=\n\nfreq_items\x18j \x01(\x0b\x32\x1c.spark.connect.StatFreqItemsH\x00R\tfreqItems\x12:\n\tsample_by\x18k \x01(\x0b\x32\x1b.spark.connect.StatSampleByH\x00R\x08sampleBy\x12\x33\n\x07\x63\x61talog\x18\xc8\x01 \x01(\x0b\x32\x16.spark.connect.CatalogH\x00R\x07\x63\x61talog\x12=\n\x0bml_relation\x18\xac\x02 \x01(\x0b\x32\x19.spark.connect.MlRelationH\x00R\nmlRelation\x12\x35\n\textension\x18\xe6\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textension\x12\x33\n\x07unknown\x18\xe7\x07 \x01(\x0b\x32\x16.spark.connect.UnknownH\x00R\x07unknownB\n\n\x08rel_type"\xe4\x03\n\nMlRelation\x12\x43\n\ttransform\x18\x01 \x01(\x0b\x32#.spark.connect.MlRelation.TransformH\x00R\ttransform\x12,\n\x05\x66\x65tch\x18\x02 \x01(\x0b\x32\x14.spark.connect.FetchH\x00R\x05\x66\x65tch\x12P\n\x15model_summary_dataset\x18\x03 
\x01(\x0b\x32\x17.spark.connect.RelationH\x01R\x13modelSummaryDataset\x88\x01\x01\x1a\xeb\x01\n\tTransform\x12\x33\n\x07obj_ref\x18\x01 \x01(\x0b\x32\x18.spark.connect.ObjectRefH\x00R\x06objRef\x12=\n\x0btransformer\x18\x02 \x01(\x0b\x32\x19.spark.connect.MlOperatorH\x00R\x0btransformer\x12-\n\x05input\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12/\n\x06params\x18\x04 \x01(\x0b\x32\x17.spark.connect.MlParamsR\x06paramsB\n\n\x08operatorB\t\n\x07ml_typeB\x18\n\x16_model_summary_dataset"\xcb\x02\n\x05\x46\x65tch\x12\x31\n\x07obj_ref\x18\x01 \x01(\x0b\x32\x18.spark.connect.ObjectRefR\x06objRef\x12\x35\n\x07methods\x18\x02 \x03(\x0b\x32\x1b.spark.connect.Fetch.MethodR\x07methods\x1a\xd7\x01\n\x06Method\x12\x16\n\x06method\x18\x01 \x01(\tR\x06method\x12\x34\n\x04\x61rgs\x18\x02 \x03(\x0b\x32 .spark.connect.Fetch.Method.ArgsR\x04\x61rgs\x1a\x7f\n\x04\x41rgs\x12\x39\n\x05param\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralH\x00R\x05param\x12/\n\x05input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationH\x00R\x05inputB\x0b\n\targs_type"\t\n\x07Unknown"\x8e\x01\n\x0eRelationCommon\x12#\n\x0bsource_info\x18\x01 \x01(\tB\x02\x18\x01R\nsourceInfo\x12\x1c\n\x07plan_id\x18\x02 \x01(\x03H\x00R\x06planId\x88\x01\x01\x12-\n\x06origin\x18\x03 \x01(\x0b\x32\x15.spark.connect.OriginR\x06originB\n\n\x08_plan_id"\xde\x03\n\x03SQL\x12\x14\n\x05query\x18\x01 \x01(\tR\x05query\x12\x34\n\x04\x61rgs\x18\x02 \x03(\x0b\x32\x1c.spark.connect.SQL.ArgsEntryB\x02\x18\x01R\x04\x61rgs\x12@\n\x08pos_args\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralB\x02\x18\x01R\x07posArgs\x12O\n\x0fnamed_arguments\x18\x04 \x03(\x0b\x32&.spark.connect.SQL.NamedArgumentsEntryR\x0enamedArguments\x12>\n\rpos_arguments\x18\x05 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cposArguments\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01\x1a\\\n\x13NamedArgumentsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12/\n\x05value\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x05value:\x02\x38\x01"u\n\rWithRelations\x12+\n\x04root\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04root\x12\x37\n\nreferences\x18\x02 \x03(\x0b\x32\x17.spark.connect.RelationR\nreferences"\x97\x05\n\x04Read\x12\x41\n\x0bnamed_table\x18\x01 \x01(\x0b\x32\x1e.spark.connect.Read.NamedTableH\x00R\nnamedTable\x12\x41\n\x0b\x64\x61ta_source\x18\x02 \x01(\x0b\x32\x1e.spark.connect.Read.DataSourceH\x00R\ndataSource\x12!\n\x0cis_streaming\x18\x03 \x01(\x08R\x0bisStreaming\x1a\xc0\x01\n\nNamedTable\x12/\n\x13unparsed_identifier\x18\x01 \x01(\tR\x12unparsedIdentifier\x12\x45\n\x07options\x18\x02 \x03(\x0b\x32+.spark.connect.Read.NamedTable.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x95\x02\n\nDataSource\x12\x1b\n\x06\x66ormat\x18\x01 \x01(\tH\x00R\x06\x66ormat\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x12\x45\n\x07options\x18\x03 \x03(\x0b\x32+.spark.connect.Read.DataSource.OptionsEntryR\x07options\x12\x14\n\x05paths\x18\x04 \x03(\tR\x05paths\x12\x1e\n\npredicates\x18\x05 \x03(\tR\npredicates\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07_formatB\t\n\x07_schemaB\x0b\n\tread_type"u\n\x07Project\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12;\n\x0b\x65xpressions\x18\x03 
\x03(\x0b\x32\x19.spark.connect.ExpressionR\x0b\x65xpressions"p\n\x06\x46ilter\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x37\n\tcondition\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\tcondition"\x95\x05\n\x04Join\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 \x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinType\x12#\n\rusing_columns\x18\x05 \x03(\tR\x0cusingColumns\x12K\n\x0ejoin_data_type\x18\x06 \x01(\x0b\x32 .spark.connect.Join.JoinDataTypeH\x00R\x0cjoinDataType\x88\x01\x01\x1a\\\n\x0cJoinDataType\x12$\n\x0eis_left_struct\x18\x01 \x01(\x08R\x0cisLeftStruct\x12&\n\x0fis_right_struct\x18\x02 \x01(\x08R\risRightStruct"\xd0\x01\n\x08JoinType\x12\x19\n\x15JOIN_TYPE_UNSPECIFIED\x10\x00\x12\x13\n\x0fJOIN_TYPE_INNER\x10\x01\x12\x18\n\x14JOIN_TYPE_FULL_OUTER\x10\x02\x12\x18\n\x14JOIN_TYPE_LEFT_OUTER\x10\x03\x12\x19\n\x15JOIN_TYPE_RIGHT_OUTER\x10\x04\x12\x17\n\x13JOIN_TYPE_LEFT_ANTI\x10\x05\x12\x17\n\x13JOIN_TYPE_LEFT_SEMI\x10\x06\x12\x13\n\x0fJOIN_TYPE_CROSS\x10\x07\x42\x11\n\x0f_join_data_type"\xdf\x03\n\x0cSetOperation\x12\x36\n\nleft_input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\tleftInput\x12\x38\n\x0bright_input\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\nrightInput\x12\x45\n\x0bset_op_type\x18\x03 \x01(\x0e\x32%.spark.connect.SetOperation.SetOpTypeR\tsetOpType\x12\x1a\n\x06is_all\x18\x04 \x01(\x08H\x00R\x05isAll\x88\x01\x01\x12\x1c\n\x07\x62y_name\x18\x05 \x01(\x08H\x01R\x06\x62yName\x88\x01\x01\x12\x37\n\x15\x61llow_missing_columns\x18\x06 \x01(\x08H\x02R\x13\x61llowMissingColumns\x88\x01\x01"r\n\tSetOpType\x12\x1b\n\x17SET_OP_TYPE_UNSPECIFIED\x10\x00\x12\x19\n\x15SET_OP_TYPE_INTERSECT\x10\x01\x12\x15\n\x11SET_OP_TYPE_UNION\x10\x02\x12\x16\n\x12SET_OP_TYPE_EXCEPT\x10\x03\x42\t\n\x07_is_allB\n\n\x08_by_nameB\x18\n\x16_allow_missing_columns"L\n\x05Limit\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"O\n\x06Offset\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06offset\x18\x02 \x01(\x05R\x06offset"K\n\x04Tail\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05limit\x18\x02 \x01(\x05R\x05limit"\xfe\x05\n\tAggregate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x41\n\ngroup_type\x18\x02 \x01(\x0e\x32".spark.connect.Aggregate.GroupTypeR\tgroupType\x12L\n\x14grouping_expressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12N\n\x15\x61ggregate_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x14\x61ggregateExpressions\x12\x34\n\x05pivot\x18\x05 \x01(\x0b\x32\x1e.spark.connect.Aggregate.PivotR\x05pivot\x12J\n\rgrouping_sets\x18\x06 \x03(\x0b\x32%.spark.connect.Aggregate.GroupingSetsR\x0cgroupingSets\x1ao\n\x05Pivot\x12+\n\x03\x63ol\x18\x01 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x39\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values\x1aL\n\x0cGroupingSets\x12<\n\x0cgrouping_set\x18\x01 
\x03(\x0b\x32\x19.spark.connect.ExpressionR\x0bgroupingSet"\x9f\x01\n\tGroupType\x12\x1a\n\x16GROUP_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12GROUP_TYPE_GROUPBY\x10\x01\x12\x15\n\x11GROUP_TYPE_ROLLUP\x10\x02\x12\x13\n\x0fGROUP_TYPE_CUBE\x10\x03\x12\x14\n\x10GROUP_TYPE_PIVOT\x10\x04\x12\x1c\n\x18GROUP_TYPE_GROUPING_SETS\x10\x05"\xa0\x01\n\x04Sort\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x05order\x18\x02 \x03(\x0b\x32#.spark.connect.Expression.SortOrderR\x05order\x12 \n\tis_global\x18\x03 \x01(\x08H\x00R\x08isGlobal\x88\x01\x01\x42\x0c\n\n_is_global"\x8d\x01\n\x04\x44rop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x33\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07\x63olumns\x12!\n\x0c\x63olumn_names\x18\x03 \x03(\tR\x0b\x63olumnNames"\xf0\x01\n\x0b\x44\x65\x64uplicate\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames\x12\x32\n\x13\x61ll_columns_as_keys\x18\x03 \x01(\x08H\x00R\x10\x61llColumnsAsKeys\x88\x01\x01\x12.\n\x10within_watermark\x18\x04 \x01(\x08H\x01R\x0fwithinWatermark\x88\x01\x01\x42\x16\n\x14_all_columns_as_keysB\x13\n\x11_within_watermark"Y\n\rLocalRelation\x12\x17\n\x04\x64\x61ta\x18\x01 \x01(\x0cH\x00R\x04\x64\x61ta\x88\x01\x01\x12\x1b\n\x06schema\x18\x02 \x01(\tH\x01R\x06schema\x88\x01\x01\x42\x07\n\x05_dataB\t\n\x07_schema"H\n\x13\x43\x61\x63hedLocalRelation\x12\x12\n\x04hash\x18\x03 \x01(\tR\x04hashJ\x04\x08\x01\x10\x02J\x04\x08\x02\x10\x03R\x06userIdR\tsessionId"7\n\x14\x43\x61\x63hedRemoteRelation\x12\x1f\n\x0brelation_id\x18\x01 \x01(\tR\nrelationId"\x91\x02\n\x06Sample\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1f\n\x0blower_bound\x18\x02 \x01(\x01R\nlowerBound\x12\x1f\n\x0bupper_bound\x18\x03 \x01(\x01R\nupperBound\x12.\n\x10with_replacement\x18\x04 \x01(\x08H\x00R\x0fwithReplacement\x88\x01\x01\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x01R\x04seed\x88\x01\x01\x12/\n\x13\x64\x65terministic_order\x18\x06 \x01(\x08R\x12\x64\x65terministicOrderB\x13\n\x11_with_replacementB\x07\n\x05_seed"\x91\x01\n\x05Range\x12\x19\n\x05start\x18\x01 \x01(\x03H\x00R\x05start\x88\x01\x01\x12\x10\n\x03\x65nd\x18\x02 \x01(\x03R\x03\x65nd\x12\x12\n\x04step\x18\x03 \x01(\x03R\x04step\x12*\n\x0enum_partitions\x18\x04 \x01(\x05H\x01R\rnumPartitions\x88\x01\x01\x42\x08\n\x06_startB\x11\n\x0f_num_partitions"r\n\rSubqueryAlias\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x14\n\x05\x61lias\x18\x02 \x01(\tR\x05\x61lias\x12\x1c\n\tqualifier\x18\x03 \x03(\tR\tqualifier"\x8e\x01\n\x0bRepartition\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12%\n\x0enum_partitions\x18\x02 \x01(\x05R\rnumPartitions\x12\x1d\n\x07shuffle\x18\x03 \x01(\x08H\x00R\x07shuffle\x88\x01\x01\x42\n\n\x08_shuffle"\x8e\x01\n\nShowString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate\x12\x1a\n\x08vertical\x18\x04 \x01(\x08R\x08vertical"r\n\nHtmlString\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x19\n\x08num_rows\x18\x02 \x01(\x05R\x07numRows\x12\x1a\n\x08truncate\x18\x03 \x01(\x05R\x08truncate"\\\n\x0bStatSummary\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1e\n\nstatistics\x18\x02 \x03(\tR\nstatistics"Q\n\x0cStatDescribe\x12-\n\x05input\x18\x01 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols"e\n\x0cStatCrosstab\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"`\n\x07StatCov\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2"\x89\x01\n\x08StatCorr\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ol1\x18\x02 \x01(\tR\x04\x63ol1\x12\x12\n\x04\x63ol2\x18\x03 \x01(\tR\x04\x63ol2\x12\x1b\n\x06method\x18\x04 \x01(\tH\x00R\x06method\x88\x01\x01\x42\t\n\x07_method"\xa4\x01\n\x12StatApproxQuantile\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12$\n\rprobabilities\x18\x03 \x03(\x01R\rprobabilities\x12%\n\x0erelative_error\x18\x04 \x01(\x01R\rrelativeError"}\n\rStatFreqItems\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x1d\n\x07support\x18\x03 \x01(\x01H\x00R\x07support\x88\x01\x01\x42\n\n\x08_support"\xb5\x02\n\x0cStatSampleBy\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03\x63ol\x18\x02 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x03\x63ol\x12\x42\n\tfractions\x18\x03 \x03(\x0b\x32$.spark.connect.StatSampleBy.FractionR\tfractions\x12\x17\n\x04seed\x18\x05 \x01(\x03H\x00R\x04seed\x88\x01\x01\x1a\x63\n\x08\x46raction\x12;\n\x07stratum\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x07stratum\x12\x1a\n\x08\x66raction\x18\x02 \x01(\x01R\x08\x66ractionB\x07\n\x05_seed"\x86\x01\n\x06NAFill\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\x39\n\x06values\x18\x03 \x03(\x0b\x32!.spark.connect.Expression.LiteralR\x06values"\x86\x01\n\x06NADrop\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12\'\n\rmin_non_nulls\x18\x03 \x01(\x05H\x00R\x0bminNonNulls\x88\x01\x01\x42\x10\n\x0e_min_non_nulls"\xa8\x02\n\tNAReplace\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04\x63ols\x18\x02 \x03(\tR\x04\x63ols\x12H\n\x0creplacements\x18\x03 \x03(\x0b\x32$.spark.connect.NAReplace.ReplacementR\x0creplacements\x1a\x8d\x01\n\x0bReplacement\x12>\n\told_value\x18\x01 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08oldValue\x12>\n\tnew_value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x08newValue"X\n\x04ToDF\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12!\n\x0c\x63olumn_names\x18\x02 \x03(\tR\x0b\x63olumnNames"\xfe\x02\n\x12WithColumnsRenamed\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12i\n\x12rename_columns_map\x18\x02 \x03(\x0b\x32\x37.spark.connect.WithColumnsRenamed.RenameColumnsMapEntryB\x02\x18\x01R\x10renameColumnsMap\x12\x42\n\x07renames\x18\x03 \x03(\x0b\x32(.spark.connect.WithColumnsRenamed.RenameR\x07renames\x1a\x43\n\x15RenameColumnsMapEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x45\n\x06Rename\x12\x19\n\x08\x63ol_name\x18\x01 \x01(\tR\x07\x63olName\x12 \n\x0cnew_col_name\x18\x02 \x01(\tR\nnewColName"w\n\x0bWithColumns\x12-\n\x05input\x18\x01 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x39\n\x07\x61liases\x18\x02 \x03(\x0b\x32\x1f.spark.connect.Expression.AliasR\x07\x61liases"\x86\x01\n\rWithWatermark\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\nevent_time\x18\x02 \x01(\tR\teventTime\x12\'\n\x0f\x64\x65lay_threshold\x18\x03 \x01(\tR\x0e\x64\x65layThreshold"\x84\x01\n\x04Hint\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x39\n\nparameters\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\nparameters"\xc7\x02\n\x07Unpivot\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12+\n\x03ids\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x03ids\x12:\n\x06values\x18\x03 \x01(\x0b\x32\x1d.spark.connect.Unpivot.ValuesH\x00R\x06values\x88\x01\x01\x12\x30\n\x14variable_column_name\x18\x04 \x01(\tR\x12variableColumnName\x12*\n\x11value_column_name\x18\x05 \x01(\tR\x0fvalueColumnName\x1a;\n\x06Values\x12\x31\n\x06values\x18\x01 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x06valuesB\t\n\x07_values"z\n\tTranspose\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12>\n\rindex_columns\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0cindexColumns"}\n\x1dUnresolvedTableValuedFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12\x37\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments"j\n\x08ToSchema\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12/\n\x06schema\x18\x02 \x01(\x0b\x32\x17.spark.connect.DataTypeR\x06schema"\xcb\x01\n\x17RepartitionByExpression\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x0fpartition_exprs\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x0epartitionExprs\x12*\n\x0enum_partitions\x18\x03 \x01(\x05H\x00R\rnumPartitions\x88\x01\x01\x42\x11\n\x0f_num_partitions"\xe8\x01\n\rMapPartitions\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x42\n\x04\x66unc\x18\x02 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12"\n\nis_barrier\x18\x03 \x01(\x08H\x00R\tisBarrier\x88\x01\x01\x12"\n\nprofile_id\x18\x04 \x01(\x05H\x01R\tprofileId\x88\x01\x01\x42\r\n\x0b_is_barrierB\r\n\x0b_profile_id"\xd2\x06\n\x08GroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12J\n\x13sorting_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x12sortingExpressions\x12<\n\rinitial_input\x18\x05 \x01(\x0b\x32\x17.spark.connect.RelationR\x0cinitialInput\x12[\n\x1cinitial_grouping_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x1ainitialGroupingExpressions\x12;\n\x18is_map_groups_with_state\x18\x07 \x01(\x08H\x00R\x14isMapGroupsWithState\x88\x01\x01\x12$\n\x0boutput_mode\x18\x08 \x01(\tH\x01R\noutputMode\x88\x01\x01\x12&\n\x0ctimeout_conf\x18\t \x01(\tH\x02R\x0btimeoutConf\x88\x01\x01\x12?\n\x0cstate_schema\x18\n \x01(\x0b\x32\x17.spark.connect.DataTypeH\x03R\x0bstateSchema\x88\x01\x01\x12\x65\n\x19transform_with_state_info\x18\x0b 
\x01(\x0b\x32%.spark.connect.TransformWithStateInfoH\x04R\x16transformWithStateInfo\x88\x01\x01\x42\x1b\n\x19_is_map_groups_with_stateB\x0e\n\x0c_output_modeB\x0f\n\r_timeout_confB\x0f\n\r_state_schemaB\x1c\n\x1a_transform_with_state_info"\xdf\x01\n\x16TransformWithStateInfo\x12\x1b\n\ttime_mode\x18\x01 \x01(\tR\x08timeMode\x12\x38\n\x16\x65vent_time_column_name\x18\x02 \x01(\tH\x00R\x13\x65ventTimeColumnName\x88\x01\x01\x12\x41\n\routput_schema\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x01R\x0coutputSchema\x88\x01\x01\x42\x19\n\x17_event_time_column_nameB\x10\n\x0e_output_schema"\x8e\x04\n\nCoGroupMap\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12W\n\x1ainput_grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18inputGroupingExpressions\x12-\n\x05other\x18\x03 \x01(\x0b\x32\x17.spark.connect.RelationR\x05other\x12W\n\x1aother_grouping_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x18otherGroupingExpressions\x12\x42\n\x04\x66unc\x18\x05 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12U\n\x19input_sorting_expressions\x18\x06 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17inputSortingExpressions\x12U\n\x19other_sorting_expressions\x18\x07 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x17otherSortingExpressions"\xe5\x02\n\x16\x41pplyInPandasWithState\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12L\n\x14grouping_expressions\x18\x02 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13groupingExpressions\x12\x42\n\x04\x66unc\x18\x03 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionR\x04\x66unc\x12#\n\routput_schema\x18\x04 \x01(\tR\x0coutputSchema\x12!\n\x0cstate_schema\x18\x05 \x01(\tR\x0bstateSchema\x12\x1f\n\x0boutput_mode\x18\x06 \x01(\tR\noutputMode\x12!\n\x0ctimeout_conf\x18\x07 \x01(\tR\x0btimeoutConf"\xf4\x01\n$CommonInlineUserDefinedTableFunction\x12#\n\rfunction_name\x18\x01 \x01(\tR\x0c\x66unctionName\x12$\n\rdeterministic\x18\x02 \x01(\x08R\rdeterministic\x12\x37\n\targuments\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\targuments\x12<\n\x0bpython_udtf\x18\x04 \x01(\x0b\x32\x19.spark.connect.PythonUDTFH\x00R\npythonUdtfB\n\n\x08\x66unction"\xb1\x01\n\nPythonUDTF\x12=\n\x0breturn_type\x18\x01 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\nreturnType\x88\x01\x01\x12\x1b\n\teval_type\x18\x02 \x01(\x05R\x08\x65valType\x12\x18\n\x07\x63ommand\x18\x03 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x04 \x01(\tR\tpythonVerB\x0e\n\x0c_return_type"\x97\x01\n!CommonInlineUserDefinedDataSource\x12\x12\n\x04name\x18\x01 \x01(\tR\x04name\x12O\n\x12python_data_source\x18\x02 \x01(\x0b\x32\x1f.spark.connect.PythonDataSourceH\x00R\x10pythonDataSourceB\r\n\x0b\x64\x61ta_source"K\n\x10PythonDataSource\x12\x18\n\x07\x63ommand\x18\x01 \x01(\x0cR\x07\x63ommand\x12\x1d\n\npython_ver\x18\x02 \x01(\tR\tpythonVer"\x88\x01\n\x0e\x43ollectMetrics\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x33\n\x07metrics\x18\x03 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x07metrics"\x84\x03\n\x05Parse\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x38\n\x06\x66ormat\x18\x02 \x01(\x0e\x32 .spark.connect.Parse.ParseFormatR\x06\x66ormat\x12\x34\n\x06schema\x18\x03 \x01(\x0b\x32\x17.spark.connect.DataTypeH\x00R\x06schema\x88\x01\x01\x12;\n\x07options\x18\x04 \x03(\x0b\x32!.spark.connect.Parse.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 
\x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"X\n\x0bParseFormat\x12\x1c\n\x18PARSE_FORMAT_UNSPECIFIED\x10\x00\x12\x14\n\x10PARSE_FORMAT_CSV\x10\x01\x12\x15\n\x11PARSE_FORMAT_JSON\x10\x02\x42\t\n\x07_schema"\xdb\x03\n\x08\x41sOfJoin\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12\x37\n\nleft_as_of\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08leftAsOf\x12\x39\n\x0bright_as_of\x18\x04 \x01(\x0b\x32\x19.spark.connect.ExpressionR\trightAsOf\x12\x36\n\tjoin_expr\x18\x05 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x08joinExpr\x12#\n\rusing_columns\x18\x06 \x03(\tR\x0cusingColumns\x12\x1b\n\tjoin_type\x18\x07 \x01(\tR\x08joinType\x12\x37\n\ttolerance\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\ttolerance\x12.\n\x13\x61llow_exact_matches\x18\t \x01(\x08R\x11\x61llowExactMatches\x12\x1c\n\tdirection\x18\n \x01(\tR\tdirection"\xe6\x01\n\x0bLateralJoin\x12+\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x04left\x12-\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.RelationR\x05right\x12@\n\x0ejoin_condition\x18\x03 \x01(\x0b\x32\x19.spark.connect.ExpressionR\rjoinCondition\x12\x39\n\tjoin_type\x18\x04 \x01(\x0e\x32\x1c.spark.connect.Join.JoinTypeR\x08joinTypeB6\n\x1eorg.apache.spark.connect.protoP\x01Z\x12internal/generatedb\x06proto3' ) _globals = globals() @@ -79,171 +79,171 @@ _globals["_PARSE_OPTIONSENTRY"]._loaded_options = None _globals["_PARSE_OPTIONSENTRY"]._serialized_options = b"8\001" _globals["_RELATION"]._serialized_start = 224 - _globals["_RELATION"]._serialized_end = 3964 - _globals["_MLRELATION"]._serialized_start = 3967 - _globals["_MLRELATION"]._serialized_end = 4451 - _globals["_MLRELATION_TRANSFORM"]._serialized_start = 4179 - _globals["_MLRELATION_TRANSFORM"]._serialized_end = 4414 - _globals["_FETCH"]._serialized_start = 4454 - _globals["_FETCH"]._serialized_end = 4785 - _globals["_FETCH_METHOD"]._serialized_start = 4570 - _globals["_FETCH_METHOD"]._serialized_end = 4785 - _globals["_FETCH_METHOD_ARGS"]._serialized_start = 4658 - _globals["_FETCH_METHOD_ARGS"]._serialized_end = 4785 - _globals["_UNKNOWN"]._serialized_start = 4787 - _globals["_UNKNOWN"]._serialized_end = 4796 - _globals["_RELATIONCOMMON"]._serialized_start = 4799 - _globals["_RELATIONCOMMON"]._serialized_end = 4941 - _globals["_SQL"]._serialized_start = 4944 - _globals["_SQL"]._serialized_end = 5422 - _globals["_SQL_ARGSENTRY"]._serialized_start = 5238 - _globals["_SQL_ARGSENTRY"]._serialized_end = 5328 - _globals["_SQL_NAMEDARGUMENTSENTRY"]._serialized_start = 5330 - _globals["_SQL_NAMEDARGUMENTSENTRY"]._serialized_end = 5422 - _globals["_WITHRELATIONS"]._serialized_start = 5424 - _globals["_WITHRELATIONS"]._serialized_end = 5541 - _globals["_READ"]._serialized_start = 5544 - _globals["_READ"]._serialized_end = 6207 - _globals["_READ_NAMEDTABLE"]._serialized_start = 5722 - _globals["_READ_NAMEDTABLE"]._serialized_end = 5914 - _globals["_READ_NAMEDTABLE_OPTIONSENTRY"]._serialized_start = 5856 - _globals["_READ_NAMEDTABLE_OPTIONSENTRY"]._serialized_end = 5914 - _globals["_READ_DATASOURCE"]._serialized_start = 5917 - _globals["_READ_DATASOURCE"]._serialized_end = 6194 - _globals["_READ_DATASOURCE_OPTIONSENTRY"]._serialized_start = 5856 - _globals["_READ_DATASOURCE_OPTIONSENTRY"]._serialized_end = 5914 - _globals["_PROJECT"]._serialized_start = 6209 - _globals["_PROJECT"]._serialized_end = 6326 - _globals["_FILTER"]._serialized_start = 6328 - 
_globals["_FILTER"]._serialized_end = 6440 - _globals["_JOIN"]._serialized_start = 6443 - _globals["_JOIN"]._serialized_end = 7104 - _globals["_JOIN_JOINDATATYPE"]._serialized_start = 6782 - _globals["_JOIN_JOINDATATYPE"]._serialized_end = 6874 - _globals["_JOIN_JOINTYPE"]._serialized_start = 6877 - _globals["_JOIN_JOINTYPE"]._serialized_end = 7085 - _globals["_SETOPERATION"]._serialized_start = 7107 - _globals["_SETOPERATION"]._serialized_end = 7586 - _globals["_SETOPERATION_SETOPTYPE"]._serialized_start = 7423 - _globals["_SETOPERATION_SETOPTYPE"]._serialized_end = 7537 - _globals["_LIMIT"]._serialized_start = 7588 - _globals["_LIMIT"]._serialized_end = 7664 - _globals["_OFFSET"]._serialized_start = 7666 - _globals["_OFFSET"]._serialized_end = 7745 - _globals["_TAIL"]._serialized_start = 7747 - _globals["_TAIL"]._serialized_end = 7822 - _globals["_AGGREGATE"]._serialized_start = 7825 - _globals["_AGGREGATE"]._serialized_end = 8591 - _globals["_AGGREGATE_PIVOT"]._serialized_start = 8240 - _globals["_AGGREGATE_PIVOT"]._serialized_end = 8351 - _globals["_AGGREGATE_GROUPINGSETS"]._serialized_start = 8353 - _globals["_AGGREGATE_GROUPINGSETS"]._serialized_end = 8429 - _globals["_AGGREGATE_GROUPTYPE"]._serialized_start = 8432 - _globals["_AGGREGATE_GROUPTYPE"]._serialized_end = 8591 - _globals["_SORT"]._serialized_start = 8594 - _globals["_SORT"]._serialized_end = 8754 - _globals["_DROP"]._serialized_start = 8757 - _globals["_DROP"]._serialized_end = 8898 - _globals["_DEDUPLICATE"]._serialized_start = 8901 - _globals["_DEDUPLICATE"]._serialized_end = 9141 - _globals["_LOCALRELATION"]._serialized_start = 9143 - _globals["_LOCALRELATION"]._serialized_end = 9232 - _globals["_CACHEDLOCALRELATION"]._serialized_start = 9234 - _globals["_CACHEDLOCALRELATION"]._serialized_end = 9306 - _globals["_CACHEDREMOTERELATION"]._serialized_start = 9308 - _globals["_CACHEDREMOTERELATION"]._serialized_end = 9363 - _globals["_SAMPLE"]._serialized_start = 9366 - _globals["_SAMPLE"]._serialized_end = 9639 - _globals["_RANGE"]._serialized_start = 9642 - _globals["_RANGE"]._serialized_end = 9787 - _globals["_SUBQUERYALIAS"]._serialized_start = 9789 - _globals["_SUBQUERYALIAS"]._serialized_end = 9903 - _globals["_REPARTITION"]._serialized_start = 9906 - _globals["_REPARTITION"]._serialized_end = 10048 - _globals["_SHOWSTRING"]._serialized_start = 10051 - _globals["_SHOWSTRING"]._serialized_end = 10193 - _globals["_HTMLSTRING"]._serialized_start = 10195 - _globals["_HTMLSTRING"]._serialized_end = 10309 - _globals["_STATSUMMARY"]._serialized_start = 10311 - _globals["_STATSUMMARY"]._serialized_end = 10403 - _globals["_STATDESCRIBE"]._serialized_start = 10405 - _globals["_STATDESCRIBE"]._serialized_end = 10486 - _globals["_STATCROSSTAB"]._serialized_start = 10488 - _globals["_STATCROSSTAB"]._serialized_end = 10589 - _globals["_STATCOV"]._serialized_start = 10591 - _globals["_STATCOV"]._serialized_end = 10687 - _globals["_STATCORR"]._serialized_start = 10690 - _globals["_STATCORR"]._serialized_end = 10827 - _globals["_STATAPPROXQUANTILE"]._serialized_start = 10830 - _globals["_STATAPPROXQUANTILE"]._serialized_end = 10994 - _globals["_STATFREQITEMS"]._serialized_start = 10996 - _globals["_STATFREQITEMS"]._serialized_end = 11121 - _globals["_STATSAMPLEBY"]._serialized_start = 11124 - _globals["_STATSAMPLEBY"]._serialized_end = 11433 - _globals["_STATSAMPLEBY_FRACTION"]._serialized_start = 11325 - _globals["_STATSAMPLEBY_FRACTION"]._serialized_end = 11424 - _globals["_NAFILL"]._serialized_start = 11436 - 
_globals["_NAFILL"]._serialized_end = 11570 - _globals["_NADROP"]._serialized_start = 11573 - _globals["_NADROP"]._serialized_end = 11707 - _globals["_NAREPLACE"]._serialized_start = 11710 - _globals["_NAREPLACE"]._serialized_end = 12006 - _globals["_NAREPLACE_REPLACEMENT"]._serialized_start = 11865 - _globals["_NAREPLACE_REPLACEMENT"]._serialized_end = 12006 - _globals["_TODF"]._serialized_start = 12008 - _globals["_TODF"]._serialized_end = 12096 - _globals["_WITHCOLUMNSRENAMED"]._serialized_start = 12099 - _globals["_WITHCOLUMNSRENAMED"]._serialized_end = 12481 - _globals["_WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY"]._serialized_start = 12343 - _globals["_WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY"]._serialized_end = 12410 - _globals["_WITHCOLUMNSRENAMED_RENAME"]._serialized_start = 12412 - _globals["_WITHCOLUMNSRENAMED_RENAME"]._serialized_end = 12481 - _globals["_WITHCOLUMNS"]._serialized_start = 12483 - _globals["_WITHCOLUMNS"]._serialized_end = 12602 - _globals["_WITHWATERMARK"]._serialized_start = 12605 - _globals["_WITHWATERMARK"]._serialized_end = 12739 - _globals["_HINT"]._serialized_start = 12742 - _globals["_HINT"]._serialized_end = 12874 - _globals["_UNPIVOT"]._serialized_start = 12877 - _globals["_UNPIVOT"]._serialized_end = 13204 - _globals["_UNPIVOT_VALUES"]._serialized_start = 13134 - _globals["_UNPIVOT_VALUES"]._serialized_end = 13193 - _globals["_TRANSPOSE"]._serialized_start = 13206 - _globals["_TRANSPOSE"]._serialized_end = 13328 - _globals["_UNRESOLVEDTABLEVALUEDFUNCTION"]._serialized_start = 13330 - _globals["_UNRESOLVEDTABLEVALUEDFUNCTION"]._serialized_end = 13455 - _globals["_TOSCHEMA"]._serialized_start = 13457 - _globals["_TOSCHEMA"]._serialized_end = 13563 - _globals["_REPARTITIONBYEXPRESSION"]._serialized_start = 13566 - _globals["_REPARTITIONBYEXPRESSION"]._serialized_end = 13769 - _globals["_MAPPARTITIONS"]._serialized_start = 13772 - _globals["_MAPPARTITIONS"]._serialized_end = 14004 - _globals["_GROUPMAP"]._serialized_start = 14007 - _globals["_GROUPMAP"]._serialized_end = 14857 - _globals["_TRANSFORMWITHSTATEINFO"]._serialized_start = 14860 - _globals["_TRANSFORMWITHSTATEINFO"]._serialized_end = 15083 - _globals["_COGROUPMAP"]._serialized_start = 15086 - _globals["_COGROUPMAP"]._serialized_end = 15612 - _globals["_APPLYINPANDASWITHSTATE"]._serialized_start = 15615 - _globals["_APPLYINPANDASWITHSTATE"]._serialized_end = 15972 - _globals["_COMMONINLINEUSERDEFINEDTABLEFUNCTION"]._serialized_start = 15975 - _globals["_COMMONINLINEUSERDEFINEDTABLEFUNCTION"]._serialized_end = 16219 - _globals["_PYTHONUDTF"]._serialized_start = 16222 - _globals["_PYTHONUDTF"]._serialized_end = 16399 - _globals["_COMMONINLINEUSERDEFINEDDATASOURCE"]._serialized_start = 16402 - _globals["_COMMONINLINEUSERDEFINEDDATASOURCE"]._serialized_end = 16553 - _globals["_PYTHONDATASOURCE"]._serialized_start = 16555 - _globals["_PYTHONDATASOURCE"]._serialized_end = 16630 - _globals["_COLLECTMETRICS"]._serialized_start = 16633 - _globals["_COLLECTMETRICS"]._serialized_end = 16769 - _globals["_PARSE"]._serialized_start = 16772 - _globals["_PARSE"]._serialized_end = 17160 - _globals["_PARSE_OPTIONSENTRY"]._serialized_start = 5856 - _globals["_PARSE_OPTIONSENTRY"]._serialized_end = 5914 - _globals["_PARSE_PARSEFORMAT"]._serialized_start = 17061 - _globals["_PARSE_PARSEFORMAT"]._serialized_end = 17149 - _globals["_ASOFJOIN"]._serialized_start = 17163 - _globals["_ASOFJOIN"]._serialized_end = 17638 - _globals["_LATERALJOIN"]._serialized_start = 17641 - _globals["_LATERALJOIN"]._serialized_end = 17871 + 
_globals["_RELATION"]._serialized_end = 4012 + _globals["_MLRELATION"]._serialized_start = 4015 + _globals["_MLRELATION"]._serialized_end = 4499 + _globals["_MLRELATION_TRANSFORM"]._serialized_start = 4227 + _globals["_MLRELATION_TRANSFORM"]._serialized_end = 4462 + _globals["_FETCH"]._serialized_start = 4502 + _globals["_FETCH"]._serialized_end = 4833 + _globals["_FETCH_METHOD"]._serialized_start = 4618 + _globals["_FETCH_METHOD"]._serialized_end = 4833 + _globals["_FETCH_METHOD_ARGS"]._serialized_start = 4706 + _globals["_FETCH_METHOD_ARGS"]._serialized_end = 4833 + _globals["_UNKNOWN"]._serialized_start = 4835 + _globals["_UNKNOWN"]._serialized_end = 4844 + _globals["_RELATIONCOMMON"]._serialized_start = 4847 + _globals["_RELATIONCOMMON"]._serialized_end = 4989 + _globals["_SQL"]._serialized_start = 4992 + _globals["_SQL"]._serialized_end = 5470 + _globals["_SQL_ARGSENTRY"]._serialized_start = 5286 + _globals["_SQL_ARGSENTRY"]._serialized_end = 5376 + _globals["_SQL_NAMEDARGUMENTSENTRY"]._serialized_start = 5378 + _globals["_SQL_NAMEDARGUMENTSENTRY"]._serialized_end = 5470 + _globals["_WITHRELATIONS"]._serialized_start = 5472 + _globals["_WITHRELATIONS"]._serialized_end = 5589 + _globals["_READ"]._serialized_start = 5592 + _globals["_READ"]._serialized_end = 6255 + _globals["_READ_NAMEDTABLE"]._serialized_start = 5770 + _globals["_READ_NAMEDTABLE"]._serialized_end = 5962 + _globals["_READ_NAMEDTABLE_OPTIONSENTRY"]._serialized_start = 5904 + _globals["_READ_NAMEDTABLE_OPTIONSENTRY"]._serialized_end = 5962 + _globals["_READ_DATASOURCE"]._serialized_start = 5965 + _globals["_READ_DATASOURCE"]._serialized_end = 6242 + _globals["_READ_DATASOURCE_OPTIONSENTRY"]._serialized_start = 5904 + _globals["_READ_DATASOURCE_OPTIONSENTRY"]._serialized_end = 5962 + _globals["_PROJECT"]._serialized_start = 6257 + _globals["_PROJECT"]._serialized_end = 6374 + _globals["_FILTER"]._serialized_start = 6376 + _globals["_FILTER"]._serialized_end = 6488 + _globals["_JOIN"]._serialized_start = 6491 + _globals["_JOIN"]._serialized_end = 7152 + _globals["_JOIN_JOINDATATYPE"]._serialized_start = 6830 + _globals["_JOIN_JOINDATATYPE"]._serialized_end = 6922 + _globals["_JOIN_JOINTYPE"]._serialized_start = 6925 + _globals["_JOIN_JOINTYPE"]._serialized_end = 7133 + _globals["_SETOPERATION"]._serialized_start = 7155 + _globals["_SETOPERATION"]._serialized_end = 7634 + _globals["_SETOPERATION_SETOPTYPE"]._serialized_start = 7471 + _globals["_SETOPERATION_SETOPTYPE"]._serialized_end = 7585 + _globals["_LIMIT"]._serialized_start = 7636 + _globals["_LIMIT"]._serialized_end = 7712 + _globals["_OFFSET"]._serialized_start = 7714 + _globals["_OFFSET"]._serialized_end = 7793 + _globals["_TAIL"]._serialized_start = 7795 + _globals["_TAIL"]._serialized_end = 7870 + _globals["_AGGREGATE"]._serialized_start = 7873 + _globals["_AGGREGATE"]._serialized_end = 8639 + _globals["_AGGREGATE_PIVOT"]._serialized_start = 8288 + _globals["_AGGREGATE_PIVOT"]._serialized_end = 8399 + _globals["_AGGREGATE_GROUPINGSETS"]._serialized_start = 8401 + _globals["_AGGREGATE_GROUPINGSETS"]._serialized_end = 8477 + _globals["_AGGREGATE_GROUPTYPE"]._serialized_start = 8480 + _globals["_AGGREGATE_GROUPTYPE"]._serialized_end = 8639 + _globals["_SORT"]._serialized_start = 8642 + _globals["_SORT"]._serialized_end = 8802 + _globals["_DROP"]._serialized_start = 8805 + _globals["_DROP"]._serialized_end = 8946 + _globals["_DEDUPLICATE"]._serialized_start = 8949 + _globals["_DEDUPLICATE"]._serialized_end = 9189 + _globals["_LOCALRELATION"]._serialized_start = 9191 + 
_globals["_LOCALRELATION"]._serialized_end = 9280 + _globals["_CACHEDLOCALRELATION"]._serialized_start = 9282 + _globals["_CACHEDLOCALRELATION"]._serialized_end = 9354 + _globals["_CACHEDREMOTERELATION"]._serialized_start = 9356 + _globals["_CACHEDREMOTERELATION"]._serialized_end = 9411 + _globals["_SAMPLE"]._serialized_start = 9414 + _globals["_SAMPLE"]._serialized_end = 9687 + _globals["_RANGE"]._serialized_start = 9690 + _globals["_RANGE"]._serialized_end = 9835 + _globals["_SUBQUERYALIAS"]._serialized_start = 9837 + _globals["_SUBQUERYALIAS"]._serialized_end = 9951 + _globals["_REPARTITION"]._serialized_start = 9954 + _globals["_REPARTITION"]._serialized_end = 10096 + _globals["_SHOWSTRING"]._serialized_start = 10099 + _globals["_SHOWSTRING"]._serialized_end = 10241 + _globals["_HTMLSTRING"]._serialized_start = 10243 + _globals["_HTMLSTRING"]._serialized_end = 10357 + _globals["_STATSUMMARY"]._serialized_start = 10359 + _globals["_STATSUMMARY"]._serialized_end = 10451 + _globals["_STATDESCRIBE"]._serialized_start = 10453 + _globals["_STATDESCRIBE"]._serialized_end = 10534 + _globals["_STATCROSSTAB"]._serialized_start = 10536 + _globals["_STATCROSSTAB"]._serialized_end = 10637 + _globals["_STATCOV"]._serialized_start = 10639 + _globals["_STATCOV"]._serialized_end = 10735 + _globals["_STATCORR"]._serialized_start = 10738 + _globals["_STATCORR"]._serialized_end = 10875 + _globals["_STATAPPROXQUANTILE"]._serialized_start = 10878 + _globals["_STATAPPROXQUANTILE"]._serialized_end = 11042 + _globals["_STATFREQITEMS"]._serialized_start = 11044 + _globals["_STATFREQITEMS"]._serialized_end = 11169 + _globals["_STATSAMPLEBY"]._serialized_start = 11172 + _globals["_STATSAMPLEBY"]._serialized_end = 11481 + _globals["_STATSAMPLEBY_FRACTION"]._serialized_start = 11373 + _globals["_STATSAMPLEBY_FRACTION"]._serialized_end = 11472 + _globals["_NAFILL"]._serialized_start = 11484 + _globals["_NAFILL"]._serialized_end = 11618 + _globals["_NADROP"]._serialized_start = 11621 + _globals["_NADROP"]._serialized_end = 11755 + _globals["_NAREPLACE"]._serialized_start = 11758 + _globals["_NAREPLACE"]._serialized_end = 12054 + _globals["_NAREPLACE_REPLACEMENT"]._serialized_start = 11913 + _globals["_NAREPLACE_REPLACEMENT"]._serialized_end = 12054 + _globals["_TODF"]._serialized_start = 12056 + _globals["_TODF"]._serialized_end = 12144 + _globals["_WITHCOLUMNSRENAMED"]._serialized_start = 12147 + _globals["_WITHCOLUMNSRENAMED"]._serialized_end = 12529 + _globals["_WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY"]._serialized_start = 12391 + _globals["_WITHCOLUMNSRENAMED_RENAMECOLUMNSMAPENTRY"]._serialized_end = 12458 + _globals["_WITHCOLUMNSRENAMED_RENAME"]._serialized_start = 12460 + _globals["_WITHCOLUMNSRENAMED_RENAME"]._serialized_end = 12529 + _globals["_WITHCOLUMNS"]._serialized_start = 12531 + _globals["_WITHCOLUMNS"]._serialized_end = 12650 + _globals["_WITHWATERMARK"]._serialized_start = 12653 + _globals["_WITHWATERMARK"]._serialized_end = 12787 + _globals["_HINT"]._serialized_start = 12790 + _globals["_HINT"]._serialized_end = 12922 + _globals["_UNPIVOT"]._serialized_start = 12925 + _globals["_UNPIVOT"]._serialized_end = 13252 + _globals["_UNPIVOT_VALUES"]._serialized_start = 13182 + _globals["_UNPIVOT_VALUES"]._serialized_end = 13241 + _globals["_TRANSPOSE"]._serialized_start = 13254 + _globals["_TRANSPOSE"]._serialized_end = 13376 + _globals["_UNRESOLVEDTABLEVALUEDFUNCTION"]._serialized_start = 13378 + _globals["_UNRESOLVEDTABLEVALUEDFUNCTION"]._serialized_end = 13503 + _globals["_TOSCHEMA"]._serialized_start = 
13505 + _globals["_TOSCHEMA"]._serialized_end = 13611 + _globals["_REPARTITIONBYEXPRESSION"]._serialized_start = 13614 + _globals["_REPARTITIONBYEXPRESSION"]._serialized_end = 13817 + _globals["_MAPPARTITIONS"]._serialized_start = 13820 + _globals["_MAPPARTITIONS"]._serialized_end = 14052 + _globals["_GROUPMAP"]._serialized_start = 14055 + _globals["_GROUPMAP"]._serialized_end = 14905 + _globals["_TRANSFORMWITHSTATEINFO"]._serialized_start = 14908 + _globals["_TRANSFORMWITHSTATEINFO"]._serialized_end = 15131 + _globals["_COGROUPMAP"]._serialized_start = 15134 + _globals["_COGROUPMAP"]._serialized_end = 15660 + _globals["_APPLYINPANDASWITHSTATE"]._serialized_start = 15663 + _globals["_APPLYINPANDASWITHSTATE"]._serialized_end = 16020 + _globals["_COMMONINLINEUSERDEFINEDTABLEFUNCTION"]._serialized_start = 16023 + _globals["_COMMONINLINEUSERDEFINEDTABLEFUNCTION"]._serialized_end = 16267 + _globals["_PYTHONUDTF"]._serialized_start = 16270 + _globals["_PYTHONUDTF"]._serialized_end = 16447 + _globals["_COMMONINLINEUSERDEFINEDDATASOURCE"]._serialized_start = 16450 + _globals["_COMMONINLINEUSERDEFINEDDATASOURCE"]._serialized_end = 16601 + _globals["_PYTHONDATASOURCE"]._serialized_start = 16603 + _globals["_PYTHONDATASOURCE"]._serialized_end = 16678 + _globals["_COLLECTMETRICS"]._serialized_start = 16681 + _globals["_COLLECTMETRICS"]._serialized_end = 16817 + _globals["_PARSE"]._serialized_start = 16820 + _globals["_PARSE"]._serialized_end = 17208 + _globals["_PARSE_OPTIONSENTRY"]._serialized_start = 5904 + _globals["_PARSE_OPTIONSENTRY"]._serialized_end = 5962 + _globals["_PARSE_PARSEFORMAT"]._serialized_start = 17109 + _globals["_PARSE_PARSEFORMAT"]._serialized_end = 17197 + _globals["_ASOFJOIN"]._serialized_start = 17211 + _globals["_ASOFJOIN"]._serialized_end = 17686 + _globals["_LATERALJOIN"]._serialized_start = 17689 + _globals["_LATERALJOIN"]._serialized_end = 17919 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/relations_pb2.pyi b/python/pyspark/sql/connect/proto/relations_pb2.pyi index e1eb7945c19f0..d1c57f6dc38e4 100644 --- a/python/pyspark/sql/connect/proto/relations_pb2.pyi +++ b/python/pyspark/sql/connect/proto/relations_pb2.pyi @@ -108,6 +108,7 @@ class Relation(google.protobuf.message.Message): TRANSPOSE_FIELD_NUMBER: builtins.int UNRESOLVED_TABLE_VALUED_FUNCTION_FIELD_NUMBER: builtins.int LATERAL_JOIN_FIELD_NUMBER: builtins.int + REFERENCED_PLAN_ID_FIELD_NUMBER: builtins.int FILL_NA_FIELD_NUMBER: builtins.int DROP_NA_FIELD_NUMBER: builtins.int REPLACE_FIELD_NUMBER: builtins.int @@ -215,6 +216,14 @@ class Relation(google.protobuf.message.Message): def unresolved_table_valued_function(self) -> global___UnresolvedTableValuedFunction: ... @property def lateral_join(self) -> global___LateralJoin: ... + referenced_plan_id: builtins.int + """Reference to a node else where in the tree. There are two use cases for this: + 1. Reduce tree duplication. In this case the tree contains two or more subtrees that are + identical. The referenced plan can only be a back reference, to a subtree that was + already visited by the planner. The planner is expected to visit the tree bottom-up from + left to right. + 1. Reduce tree depth. 
+ """ @property def fill_na(self) -> global___NAFill: """NA functions""" @@ -301,6 +310,7 @@ class Relation(google.protobuf.message.Message): transpose: global___Transpose | None = ..., unresolved_table_valued_function: global___UnresolvedTableValuedFunction | None = ..., lateral_join: global___LateralJoin | None = ..., + referenced_plan_id: builtins.int = ..., fill_na: global___NAFill | None = ..., drop_na: global___NADrop | None = ..., replace: global___NAReplace | None = ..., @@ -394,6 +404,8 @@ class Relation(google.protobuf.message.Message): b"range", "read", b"read", + "referenced_plan_id", + b"referenced_plan_id", "rel_type", b"rel_type", "repartition", @@ -519,6 +531,8 @@ class Relation(google.protobuf.message.Message): b"range", "read", b"read", + "referenced_plan_id", + b"referenced_plan_id", "rel_type", b"rel_type", "repartition", @@ -614,6 +628,7 @@ class Relation(google.protobuf.message.Message): "transpose", "unresolved_table_valued_function", "lateral_join", + "referenced_plan_id", "fill_na", "drop_na", "replace", diff --git a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala index 03e8d011b8d5f..57fc98046f295 100644 --- a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala +++ b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala @@ -25,6 +25,7 @@ import scala.collection.mutable import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration.{DurationInt, FiniteDuration} import scala.jdk.CollectionConverters._ +import scala.util.Random import org.apache.commons.io.FileUtils import org.apache.commons.io.output.TeeOutputStream @@ -1671,6 +1672,26 @@ class ClientE2ETestSuite checkAnswer(df, (0 until 6).map(i => Row(i))) } + test("Execute optimized plan - 33 duplicate local relations") { + val implicits = spark.implicits + import implicits._ + val rng = new Random(61209389765L) + val data = IndexedSeq.tabulate(128) { id => + id -> rng.nextBytes(1024) + } + val input = data.toDF("key", "value") + val unions = Iterator.range(0, 5).foldLeft(input) { + case (current, _) => current.union(current) + } + val df = unions.filter($"key".isin(input.select($"key").filter($"key" < 5))) + .groupBy($"key", $"value") + .count() + val compressionRatio = + df.optimizedPlan.getSerializedSize.toDouble / df.plan.getSerializedSize.toDouble + assert(compressionRatio < (1.0d / 32.0d)) // It should be very close to a 1/33 ratio. + checkAnswer(df, data.take(5).map(kv => Row(kv._1, kv._2, 32L))) + } + test("SPARK-52770: Support Time type") { val df = spark.sql("SELECT TIME '12:13:14'") diff --git a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/PlanOptimizerSuite.scala b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/PlanOptimizerSuite.scala new file mode 100644 index 0000000000000..069c677b016d6 --- /dev/null +++ b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/PlanOptimizerSuite.scala @@ -0,0 +1,469 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client + +import java.util.TimeZone +import java.util.concurrent.atomic.AtomicLong + +import scala.collection.mutable +import scala.util.Random + +import com.google.protobuf.{Any => PAny} +import io.grpc.inprocess.InProcessChannelBuilder +import org.apache.arrow.memory.RootAllocator +import org.apache.commons.lang3.mutable.MutableInt +import org.scalatest.BeforeAndAfterEach + +import org.apache.spark.connect.proto +import org.apache.spark.sql.{Column, Encoder, Encoders} +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.agnosticEncoderFor +import org.apache.spark.sql.connect.{ColumnNodeToProtoConverter, DataFrame, Dataset, SparkSession} +import org.apache.spark.sql.connect.client.arrow.ArrowSerializer +import org.apache.spark.sql.connect.test.ConnectFunSuite +import org.apache.spark.sql.functions.{col, max, min} +import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout} +import org.apache.spark.sql.types.StructType + +/** + * Test suite for the [[PlanOptimizer]]. + */ +class PlanOptimizerSuite extends ConnectFunSuite with BeforeAndAfterEach { + import PlanOptimizer.PlanId + + private implicit val longEncoder: Encoder[Long] = Encoders.scalaLong + + private implicit val longLongTupleEncoder: Encoder[(Long, Long)] = + Encoders.tuple(longEncoder, longEncoder) + + private var spark: SparkSession = _ + + private def newSparkSession(): SparkSession = { + val client = SparkConnectClient( + InProcessChannelBuilder.forName(getClass.getName).directExecutor().build()) + val session = new SparkSession(client, planIdGenerator = new AtomicLong) + session.releaseSessionOnClose = false + session + } + + override def beforeEach(): Unit = { + super.beforeEach() + spark = newSparkSession() + } + + override def afterEach(): Unit = { + try { + if (spark != null) { + spark.close() + spark = null + } + } finally { + super.afterEach() + } + } + + private case class PlanStats( + numRelations: Int, + planIds: Set[Long], + numDuplicatePlanIds: Int, + numDuplicateRelations: Int = 0) + + private def collectPlanStats(plan: proto.Plan): PlanStats = { + assert(plan.hasRoot) + var numRelations = 0 + val plansIdCounts = mutable.Map.empty[Long, MutableInt] + RelationTreeUtils.visit(plan.getRoot) { relation => + PlanId.get(relation).foreach { id => + plansIdCounts.getOrElseUpdate(id, new MutableInt(0)).incrementAndGet() + } + numRelations += 1 + true + } + PlanStats( + numRelations, + plansIdCounts.keySet.toSet, + plansIdCounts.map(_._2.intValue()).count(_ > 1), + plansIdCounts.map(_._2.intValue()).filter(_ > 1).sum + ) + } + + private def checkNoDeduplication(df: Dataset[_]): Unit = { + val plan = df.plan + val optimizedPlan = df.optimizedPlan + assert(plan eq optimizedPlan) + val planStats = collectPlanStats(plan) + assert(planStats.numDuplicatePlanIds == 0) + assert(planStats.numDuplicateRelations == 0) + } + + private def checkDeduplication( + df: Dataset[_], + numRelationsReduction: Int, + sizeReduction: Long): Unit = { + val plan = df.plan + val optimizedPlan = df.optimizedPlan + assert(plan != optimizedPlan) + + val planStats = 
collectPlanStats(plan) + assert(planStats.numDuplicatePlanIds > 0) + assert(planStats.numDuplicateRelations > 0) + + // An optimized plan should contain all the plan ids of the original plan. + val optimizedPlanStats = collectPlanStats(optimizedPlan) + assert(optimizedPlan.getRoot.hasWithRelations) + assert(planStats.planIds.equals(optimizedPlanStats.planIds - PlanId(optimizedPlan.getRoot))) + + // Idempotency. Once optimized there should not be any optimization opportunity left. + assert(optimizedPlanStats.numDuplicatePlanIds == 0) + assert(optimizedPlanStats.numDuplicateRelations == 0) + assert(PlanOptimizer.optimize(optimizedPlan, () => 0L) eq optimizedPlan) + + // Relations reduction. + assert(planStats.numRelations == optimizedPlanStats.numRelations + numRelationsReduction) + + // Size reduction. + val actualSizeReduction = plan.getSerializedSize - optimizedPlan.getSerializedSize + assert(actualSizeReduction == sizeReduction, + s"Actual reduction in plan size does not match expected reduction in plan: " + + s"$actualSizeReduction != $sizeReduction") + } + + test("un-optimizable plan remains unchanged - leafs") { + checkNoDeduplication(spark.range(10)) + // checkNoDeduplication(spark.sql("select 1")) + checkNoDeduplication(spark.emptyDataset[(Long, Long)]) + checkNoDeduplication(spark.read.format("parquet").load("s3://my-bucket/my-dir")) + checkNoDeduplication(spark.newDataFrame(_ => ())) + checkNoDeduplication(spark.newDataFrame(_.getUnknownBuilder)) + checkNoDeduplication(spark.newDataFrame(_.getCachedLocalRelationBuilder.setHash("1234"))) + checkNoDeduplication(spark.newDataFrame(_.getCachedRemoteRelationBuilder.setRelationId("rel1"))) + checkNoDeduplication(spark.newDataFrame(_.setReferencedPlanId(1))) + checkNoDeduplication(spark.newDataFrame(_.setExtension(PAny.pack(spark.range(10).plan)))) + checkNoDeduplication(spark.newDataFrame { + _.getCommonInlineUserDefinedDataSourceBuilder.setName("noop") + }) + checkNoDeduplication(spark.newDataFrame { + _.getCommonInlineUserDefinedTableFunctionBuilder.setFunctionName("noop") + }) + checkNoDeduplication(spark.newDataFrame { + _.getUnresolvedTableValuedFunctionBuilder.setFunctionName("noop") + }) + checkNoDeduplication(spark.newDataFrame { + _.getCatalogBuilder.getListCatalogsBuilder.setPattern("tbl*") + }) + } + + test("un-optimizable plan remains unchanged - unary") { + val input = spark.range(10) + val id = col("id") + checkNoDeduplication(input.select((id + 1).as("plus1"))) + checkNoDeduplication(input.filter(id > 1)) + checkNoDeduplication(input.sort(id.desc)) + checkNoDeduplication(input.limit(2)) + checkNoDeduplication(input.groupBy(id).count()) + checkNoDeduplication(input.sample(0.5)) + checkNoDeduplication(input.offset(3)) + checkNoDeduplication(input.dropDuplicates().as("q")) + checkNoDeduplication(input.repartition(3)) + checkNoDeduplication(input.repartition(id)) + checkNoDeduplication(input.toDF("id")) + checkNoDeduplication(input.withColumnRenamed("id", "di")) + checkNoDeduplication(input.drop("id")) + checkNoDeduplication(input.withColumn("id_plus1", id + 1)) + checkNoDeduplication(input.hint("broadcast")) + checkNoDeduplication(input.to(new StructType().add("id", "string"))) + checkNoDeduplication(input.mapPartitions(_.map(_.toLong))) + checkNoDeduplication(input.select(id, (id / 2).as("d2"), (id * 2).as("m2")) + .unpivot(Array(id), Array(col("d2"), col("m2")), "var", "val")) + checkNoDeduplication(input.withColumn("grp", id % 2).transpose(col("grp"))) + checkNoDeduplication(input.observe("simple", min(id), max(id))) + 
checkNoDeduplication(spark.read.csv(input.map(i => s"$i,$i")(Encoders.STRING))) + checkNoDeduplication(input.withWatermark("id", "1 minute")) + checkNoDeduplication(input.describe("id")) + checkNoDeduplication(input.summary("max", "min")) + checkNoDeduplication(input.withColumn("b", id).stat.crosstab("id", "b")) + checkNoDeduplication(input.stat.freqItems(Array("id"))) + checkNoDeduplication(input.stat.sampleBy(id, Map(0L -> 0.03, 1L -> 0.02), 33L)) + checkNoDeduplication(input.na.drop()) + checkNoDeduplication(input.na.fill(true)) + checkNoDeduplication(input.na.replace("id", Map(0L -> 1L))) + + // Manual ones... + checkNoDeduplication(spark.newDataFrame { + _.getShowStringBuilder.setInput(input.plan.getRoot) + .setNumRows(10) + .setTruncate(20) + .setVertical(false) + }) + checkNoDeduplication(spark.newDataFrame { + _.getHtmlStringBuilder.setInput(input.plan.getRoot).setNumRows(10).setTruncate(20) + }) + checkNoDeduplication(spark.newDataFrame { + _.getTailBuilder.setInput(input.plan.getRoot).setLimit(4) + }) + checkNoDeduplication(spark.newDataFrame { + _.getCovBuilder.setInput(input.plan.getRoot).setCol1("a").setCol2("b") + }) + checkNoDeduplication(spark.newDataFrame { + _.getCorrBuilder.setInput(input.plan.getRoot).setCol1("a").setCol2("b") + }) + checkNoDeduplication(spark.newDataFrame { + _.getApplyInPandasWithStateBuilder.setInput(input.plan.getRoot) + .addGroupingExpressions(toExpr(id)) + }) + checkNoDeduplication(spark.newDataFrame { + _.getApproxQuantileBuilder.setInput(input.plan.getRoot) + .addCols("id") + .addProbabilities(0.1).addProbabilities(0.2) + .setRelativeError(0.01) + }) + checkNoDeduplication(spark.newDataFrame { builder => + val transform = builder.getMlRelationBuilder.getTransformBuilder + .setInput(input.plan.getRoot) + transform.getTransformerBuilder + .setName("oneHotEncoder") + .setType(proto.MlOperator.OperatorType.OPERATOR_TYPE_TRANSFORMER) + }) + } + + private def testBinaryOperationDeduplication( + name: String, + sizeReduction1: Int, + sizeReduction2: Int)( + f: ((DataFrame, Column), (DataFrame, Column)) => Dataset[_]): Unit = { + test("optimize plan with duplicated relations - " + name) { + val left = spark.range(10).as("a").toDF() + val right = spark.range(11).as("b").toDF() + // No deduplication. 
+ val df1 = f((left, left("id")), (right, right("id"))) + checkNoDeduplication(df1) + // Deduplication + val df2 = f((left, left("id")), (left, left("id"))) + checkDeduplication(df2, numRelationsReduction = -1, sizeReduction = sizeReduction1) + // Deeper tree + val df3 = f((df2.toDF(), df2("id")), (left, left("id"))) + checkDeduplication(df3, numRelationsReduction = 0, sizeReduction = sizeReduction2) + } + } + + testBinaryOperationDeduplication("join", 5, 26) { + case ((left, leftKey), (right, rightKey)) => + left.join(right, leftKey === rightKey) + } + + testBinaryOperationDeduplication("lateralJoin", 5, 26) { + case ((left, leftKey), (right, rightKey)) => + left.lateralJoin(right, leftKey === rightKey) + } + + testBinaryOperationDeduplication("union", 7, 28) { + case ((left, _), (right, _)) => + left.union(right) + } + + testBinaryOperationDeduplication("intersect", 7, 28) { + case ((left, _), (right, _)) => + left.intersect(right) + } + + testBinaryOperationDeduplication("except", 7, 28) { + case ((left, _), (right, _)) => + left.except(right) + } + + testBinaryOperationDeduplication("subquery - exists", 5, 26) { + case ((left, leftKey), (right, rightKey)) => + left.filter(right.filter(rightKey === leftKey).exists()) + } + + testBinaryOperationDeduplication("subquery - scalar", 5, 26) { + case ((left, _), (right, rightKey)) => + left.select(right.agg(min(rightKey)).scalar()) + } + + testBinaryOperationDeduplication("subquery - in", 5, 26) { + case ((left, leftKey), (right, _)) => + left.filter(!leftKey.isin(right)) + } + + testBinaryOperationDeduplication("groupMap", 5, 24) { + case ((left, leftKey), (right, rightKey)) => + val initialState = right.groupBy(rightKey).as[Long, Long] + left.groupBy(leftKey).as[Long, Long] + .mapGroupsWithState(GroupStateTimeout.EventTimeTimeout(), initialState) { + (key: Long, values: Iterator[Long], state: GroupState[Long]) => + (key, values.sum + state.get) + } + } + + testBinaryOperationDeduplication("coGroup", 5, 26) { + case ((left, leftKey), (right, rightKey)) => + val leftKv = left.groupBy(leftKey).as[Long, Long] + val rightKv = right.groupBy(rightKey).as[Long, Long] + leftKv.cogroup(rightKv) { + (key: Long, leftValues: Iterator[Long], rightValues: Iterator[Long]) => + leftValues.zipAll(rightValues, 0L, 0L).map { lr => + (key, lr._1 + lr._2) + } + } + } + + test("optimize plan with duplicated relations - asOfJoin") { + val input = spark.range(10).as("x") + val id = ColumnNodeToProtoConverter.toExpr(input("id")) + val relation = input.plan.getRoot + val df = spark.newDataFrame { builder => + builder.getAsOfJoinBuilder + .setLeft(relation) + .setLeftAsOf(id) + .setRight(relation) + .setRightAsOf(id) + .setDirection("backward") + .setAllowExactMatches(true) + } + checkDeduplication(df, numRelationsReduction = -1, sizeReduction = 5) + } + + test("optimize plan with duplicated relations - MLRelation - fetch") { + val input = spark.range(10).as("x") + val other = spark.read.format("parquet").load() + val lit = proto.Expression.Literal.newBuilder().setLong(11L).build() + val df = spark.newDataFrame { builder => + val fetch = builder.getMlRelationBuilder.getFetchBuilder + fetch.getObjRefBuilder.setId("21345") + fetch.addMethodsBuilder().setMethod("discombobulate") + .addArgs(proto.Fetch.Method.Args.newBuilder().setParam(lit)) + .addArgs(proto.Fetch.Method.Args.newBuilder().setInput(input.plan.getRoot)) + fetch.addMethodsBuilder().setMethod("fluster") + .addArgs(proto.Fetch.Method.Args.newBuilder().setInput(other.plan.getRoot)) + 
.addArgs(proto.Fetch.Method.Args.newBuilder().setInput(input.plan.getRoot)) + } + checkDeduplication(df, numRelationsReduction = -1, sizeReduction = 8) + } + + test("optimize plan with duplicated relations - subquery WithRelations rewrite") { + val input1 = spark.range(10) + val input2 = spark.emptyDataset[Long] + val input3 = spark.range(1, 1, 1).as("ref") + val df = input1.union(input3).filter( + col("id").isin(input1) && + col("id").isin(input2) && + col("id").isin(input3)) + checkDeduplication(df, numRelationsReduction = -1, sizeReduction = 16) + + // Check if the original WithRelations node is retained and has the proper references. + val root = df.optimizedPlan.getRoot + RelationTreeUtils.visit(root) { relation => + if (relation.hasWithRelations && (relation ne root)) { + val withRelations = relation.getWithRelations + assert(PlanId(df.plan.getRoot) == PlanId(relation)) + assert(withRelations.getReferencesCount == 2) + assert(withRelations.getReferences(0) eq input2.plan.getRoot) + assert(withRelations.getReferences(1).hasReferencedPlanId) + assert(withRelations.getReferences(1).getReferencedPlanId == PlanId(input3.plan.getRoot)) + } + true + } + } + + test("optimize plan with duplicated relations - 64 duplicate local relations") { + // Manually build a local relation. + val input = spark.newDataFrame { builder => + val schema = new StructType() + .add("key", "long") + .add("value", "binary") + val rng = new Random(61209389765L) + val allocator = new RootAllocator() + val byteString = try { + ArrowSerializer.serialize( + Iterator.tabulate(128) { i => + i.toLong -> rng.nextBytes(1024) + }, + agnosticEncoderFor(Encoders.tuple(longEncoder, Encoders.BINARY)), + allocator, + timeZoneId = TimeZone.getDefault.toString, + largeVarTypes = false) + } finally { + allocator.close() + } + builder.getLocalRelationBuilder.setSchema(schema.json).setData(byteString) + } + + // Build a tree with massive duplication. It will contain 64 duplicate local relations. + val df = Iterator.range(0, 6).foldLeft(input) { + case (current, _) => current.union(current) + } + // Optimization reduces size by 98.4% + checkDeduplication(df, numRelationsReduction = 107, sizeReduction = 8396068) + } + + private def join(input: Dataset[_], numJoins: Int): DataFrame = { + Iterator.fill(numJoins + 1)(input.toDF()).reduce(_.join(_)) + } + + test("optimize can increase number of relations") { + // Optimize can increase the number of relations in a plan. This happens when the number of + // relations removed does not offset the addition of the references and the WithRelations node. + + // A single relation duplicated subtree. Optimization always adds two relations + val input = spark.range(10) + checkDeduplication(join(input, 1), numRelationsReduction = -2, sizeReduction = -5) + checkDeduplication(join(input, 2), numRelationsReduction = -2, sizeReduction = 4) + checkDeduplication(join(input, 3), numRelationsReduction = -2, sizeReduction = 13) + checkDeduplication(join(input, 4), numRelationsReduction = -2, sizeReduction = 22) + checkDeduplication(join(input, 5), numRelationsReduction = -2, sizeReduction = 32) + + // A 2-relation duplicated subtree. Optimization only adds a relation if there is a single + // relation with 2 duplicates in the tree. 
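+    // In general, deduplicating a single subtree of S relations that occurs k times keeps one copy
+    // of the subtree, adds k references and one WithRelations root, so the relation count changes
+    // by k * S - (S + k + 1); this only becomes a net reduction once (k - 1) * (S - 1) > 2, which
+    // is what the expectations below illustrate.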
+ val input2 = input.as("a") + checkDeduplication(join(input2, 1), numRelationsReduction = -1, sizeReduction = 7) + checkDeduplication(join(input2, 2), numRelationsReduction = 0, sizeReduction = 28) + checkDeduplication(join(input2, 3), numRelationsReduction = 1, sizeReduction = 50) + checkDeduplication(join(input2, 4), numRelationsReduction = 2, sizeReduction = 73) + checkDeduplication(join(input2, 5), numRelationsReduction = 3, sizeReduction = 96) + + // A 3-relation duplicated subtree. Optimization always reduces the number of relations. + val input3 = input2.select(col("id")) + checkDeduplication(join(input3, 1), numRelationsReduction = 0, sizeReduction = 209) + checkDeduplication(join(input3, 2), numRelationsReduction = 2, sizeReduction = 434) + checkDeduplication(join(input3, 3), numRelationsReduction = 4, sizeReduction = 659) + } + + test("optimize can increase the size of the plan") { + // Optimize can increase the size of a plan. This happens when the size of the references and + // the withRelations node is larger than the sum of the deduplicated relations. + + // An unknown relation is tiny (7-16 bytes depending on planId). That is smaller than the + // WithRelations node (6-19 bytes depending on the withRelations planId, and the size of root + // plan) and the two references (per reference 3-13 bytes depending on planId) added by the + // optimization; as a result the size increases. + val input1 = spark.newDataFrame { builder => + builder.getUnknownBuilder + } + + // A single relation duplicated subtree. Optimization initially increases size. + checkDeduplication(join(input1, 1), numRelationsReduction = -2, sizeReduction = -10) + checkDeduplication(join(input1, 2), numRelationsReduction = -2, sizeReduction = -6) + checkDeduplication(join(input1, 3), numRelationsReduction = -2, sizeReduction = -2) + checkDeduplication(join(input1, 4), numRelationsReduction = -2, sizeReduction = 2) + + // A 2-relation duplicated subtree. Optimization always reduces size. + val input2 = input1.as("a") + checkDeduplication(join(input2, 1), numRelationsReduction = -1, sizeReduction = 2) + checkDeduplication(join(input2, 2), numRelationsReduction = 0, sizeReduction = 18) + checkDeduplication(join(input2, 3), numRelationsReduction = 1, sizeReduction = 34) + } +} diff --git a/sql/connect/common/src/main/protobuf/spark/connect/relations.proto b/sql/connect/common/src/main/protobuf/spark/connect/relations.proto index ccb674e812dc0..0d6cec38fcce8 100644 --- a/sql/connect/common/src/main/protobuf/spark/connect/relations.proto +++ b/sql/connect/common/src/main/protobuf/spark/connect/relations.proto @@ -81,6 +81,14 @@ message Relation { UnresolvedTableValuedFunction unresolved_table_valued_function = 43; LateralJoin lateral_join = 44; + // Reference to a node elsewhere in the tree. There are two use cases for this: + // 1. Reduce tree duplication. In this case the tree contains two or more subtrees that are + // identical. The referenced plan can only be a back reference to a subtree that was + // already visited by the planner. The planner is expected to visit the tree bottom-up from + // left to right. + // 2. Reduce tree depth.
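+    // In the deduplication case, for example, a plan that contains the same subtree twice can keep
+    // a single copy of the subtree in a WithRelations node and replace the other occurrence with a
+    // relation whose referenced_plan_id is set to the plan_id of the kept copy.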
+ int64 referenced_plan_id = 45; + // NA functions NAFill fill_na = 90; NADrop drop_na = 91; diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataFrameWriter.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataFrameWriter.scala index 2038037d4439c..5964459d146ee 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataFrameWriter.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataFrameWriter.scala @@ -96,7 +96,7 @@ final class DataFrameWriter[T] private[sql] (ds: Dataset[T]) extends sql.DataFra private def executeWriteOperation(f: proto.WriteOperation.Builder => Unit): Unit = { val builder = proto.WriteOperation.newBuilder() - builder.setInput(ds.plan.getRoot) + builder.setInput(ds.optimizedPlan.getRoot) // Set path or table f(builder) diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataFrameWriterV2.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataFrameWriterV2.scala index 06d339487bfb8..1d3ce623c820f 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataFrameWriterV2.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataFrameWriterV2.scala @@ -37,7 +37,7 @@ final class DataFrameWriterV2[T] private[sql] (table: String, ds: Dataset[T]) private val builder = proto.WriteOperationV2 .newBuilder() - .setInput(ds.plan.getRoot) + .setInput(ds.optimizedPlan.getRoot) .setTableName(table) /** @inheritdoc */ diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataStreamWriter.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataStreamWriter.scala index a42a463e2c42a..e6bc9ca374051 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataStreamWriter.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/DataStreamWriter.scala @@ -195,5 +195,5 @@ final class DataStreamWriter[T] private[sql] (ds: Dataset[T]) private val sinkBuilder = WriteStreamOperationStart .newBuilder() - .setInput(ds.plan.getRoot) + .setInput(ds.optimizedPlan.getRoot) } diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/Dataset.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/Dataset.scala index ec169ba114a3d..8ce08f5d81c20 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/Dataset.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/Dataset.scala @@ -147,6 +147,8 @@ class Dataset[T] private[sql] ( private[sql] val agnosticEncoder: AgnosticEncoder[T] = agnosticEncoderFor(encoder) + private[sql] lazy val optimizedPlan = sparkSession.optimizer.optimize(plan) + override def toString: String = { try { val builder = new mutable.StringBuilder @@ -211,7 +213,7 @@ class Dataset[T] private[sql] ( DataTypeProtoConverter .toCatalystType( sparkSession - .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.SCHEMA) + .analyze(optimizedPlan, proto.AnalyzePlanRequest.AnalyzeCase.SCHEMA) .getSchema .getSchema) .asInstanceOf[StructType] @@ -234,7 +236,7 @@ class Dataset[T] private[sql] ( // scalastyle:off println println( sparkSession - .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.EXPLAIN, Some(mode)) + .analyze(optimizedPlan, proto.AnalyzePlanRequest.AnalyzeCase.EXPLAIN, Some(mode)) .getExplain .getExplainString) // scalastyle:on println @@ -242,7 +244,7 @@ class Dataset[T] private[sql] ( /** @inheritdoc */ def isLocal: Boolean = sparkSession - .analyze(plan, 
proto.AnalyzePlanRequest.AnalyzeCase.IS_LOCAL) + .analyze(optimizedPlan, proto.AnalyzePlanRequest.AnalyzeCase.IS_LOCAL) .getIsLocal .getIsLocal @@ -253,7 +255,7 @@ class Dataset[T] private[sql] ( /** @inheritdoc */ def isStreaming: Boolean = sparkSession - .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.IS_STREAMING) + .analyze(optimizedPlan, proto.AnalyzePlanRequest.AnalyzeCase.IS_STREAMING) .getIsStreaming .getIsStreaming @@ -815,7 +817,7 @@ class Dataset[T] private[sql] ( protected def createTempView(viewName: String, replace: Boolean, global: Boolean): Unit = { val command = sparkSession.newCommand { builder => builder.getCreateDataframeViewBuilder - .setInput(plan.getRoot) + .setInput(optimizedPlan.getRoot) .setName(viewName) .setIsGlobal(global) .setReplace(replace) @@ -1054,7 +1056,7 @@ class Dataset[T] private[sql] ( /** @inheritdoc */ def inputFiles: Array[String] = sparkSession - .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.INPUT_FILES) + .analyze(optimizedPlan, proto.AnalyzePlanRequest.AnalyzeCase.INPUT_FILES) .getInputFiles .getFilesList .asScala @@ -1092,7 +1094,7 @@ class Dataset[T] private[sql] ( /** @inheritdoc */ def persist(): this.type = { sparkSession.analyze { builder => - builder.getPersistBuilder.setRelation(plan.getRoot) + builder.getPersistBuilder.setRelation(optimizedPlan.getRoot) } this } @@ -1101,7 +1103,7 @@ class Dataset[T] private[sql] ( def persist(newLevel: StorageLevel): this.type = { sparkSession.analyze { builder => builder.getPersistBuilder - .setRelation(plan.getRoot) + .setRelation(optimizedPlan.getRoot) .setStorageLevel(StorageLevelProtoConverter.toConnectProtoType(newLevel)) } this @@ -1111,7 +1113,7 @@ class Dataset[T] private[sql] ( def unpersist(blocking: Boolean): this.type = { sparkSession.analyze { builder => builder.getUnpersistBuilder - .setRelation(plan.getRoot) + .setRelation(optimizedPlan.getRoot) .setBlocking(blocking) } this @@ -1125,7 +1127,7 @@ class Dataset[T] private[sql] ( StorageLevelProtoConverter.toStorageLevel( sparkSession .analyze { builder => - builder.getGetStorageLevelBuilder.setRelation(plan.getRoot) + builder.getGetStorageLevelBuilder.setRelation(optimizedPlan.getRoot) } .getGetStorageLevel .getStorageLevel) @@ -1170,7 +1172,7 @@ class Dataset[T] private[sql] ( val checkpointBuilder = builder.getCheckpointCommandBuilder .setLocal(!reliableCheckpoint) .setEager(eager) - .setRelation(this.plan.getRoot) + .setRelation(this.optimizedPlan.getRoot) storageLevel.foreach { storageLevel => checkpointBuilder.setStorageLevel( StorageLevelProtoConverter.toConnectProtoType(storageLevel)) @@ -1197,13 +1199,13 @@ class Dataset[T] private[sql] ( /** @inheritdoc */ @DeveloperApi def sameSemantics(other: sql.Dataset[T]): Boolean = { - sparkSession.sameSemantics(this.plan, other.plan) + sparkSession.sameSemantics(this.optimizedPlan, other.optimizedPlan) } /** @inheritdoc */ @DeveloperApi def semanticHash(): Int = { - sparkSession.semanticHash(this.plan) + sparkSession.semanticHash(this.optimizedPlan) } /** @inheritdoc */ @@ -1212,10 +1214,10 @@ class Dataset[T] private[sql] ( } private[sql] def analyze: proto.AnalyzePlanResponse = { - sparkSession.analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.SCHEMA) + sparkSession.analyze(optimizedPlan, proto.AnalyzePlanRequest.AnalyzeCase.SCHEMA) } - def collectResult(): SparkResult[T] = sparkSession.execute(plan, agnosticEncoder) + def collectResult(): SparkResult[T] = sparkSession.execute(optimizedPlan, agnosticEncoder) private[sql] def withResult[E](f: SparkResult[T] => E): E = { val 
result = collectResult() diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/MergeIntoWriter.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/MergeIntoWriter.scala index 66354e63ca8af..880c64aab7b9f 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/MergeIntoWriter.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/MergeIntoWriter.scala @@ -49,7 +49,7 @@ class MergeIntoWriter[T] private[sql] (table: String, ds: Dataset[T], on: Column private val builder = MergeIntoTableCommand .newBuilder() .setTargetTableName(table) - .setSourceTablePlan(ds.plan.getRoot) + .setSourceTablePlan(ds.optimizedPlan.getRoot) .setMergeCondition(toExpr(on)) /** diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/SparkSession.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/SparkSession.scala index 739b0318759e5..dda919e802d43 100644 --- a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/SparkSession.scala +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/SparkSession.scala @@ -50,7 +50,7 @@ import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, RowEncoder} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{agnosticEncoderFor, BoxedLongEncoder, UnboundRowEncoder} import org.apache.spark.sql.connect.ColumnNodeToProtoConverter.toLiteral import org.apache.spark.sql.connect.ConnectConversions._ -import org.apache.spark.sql.connect.client.{ClassFinder, CloseableIterator, SparkConnectClient, SparkResult} +import org.apache.spark.sql.connect.client.{ClassFinder, CloseableIterator, PlanOptimizer, SparkConnectClient, SparkResult} import org.apache.spark.sql.connect.client.SparkConnectClient.Configuration import org.apache.spark.sql.connect.client.arrow.ArrowSerializer import org.apache.spark.sql.internal.{SessionState, SharedState, SqlApiConf, SubqueryExpression} @@ -85,6 +85,7 @@ class SparkSession private[sql] ( private[this] val allocator = new RootAllocator() private[sql] lazy val cleaner = new SessionCleaner(this) + private[sql] val optimizer = new PlanOptimizer(planIdGenerator) // a unique session ID for this session from client. private[sql] def sessionId: String = client.sessionId diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/PlanOptimizer.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/PlanOptimizer.scala new file mode 100644 index 0000000000000..72b404bb8beff --- /dev/null +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/PlanOptimizer.scala @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.connect.client + +import java.util.concurrent.atomic.AtomicLong + +import scala.collection.immutable.SeqMap +import scala.collection.mutable + +import org.apache.spark.connect.proto + +/** + * Optimizer for Spark Connect plans. This optimizer moves all duplicate subtrees from a query tree + * (Relation) into a top-level WithRelations node; the duplicates in the plan are replaced by + * references. This has several advantages: it reduces the number of nodes in the plan, it + * reduces the plan size, and it avoids redundant work on the server side (both during planning, and - + * if supported - analysis). + * + * This optimization assumes that nodes with the same plan_id are structurally equivalent. + * + * The optimization will retain all plan_ids in the input plan. This is needed because plan_ids can + * be referenced by UnresolvedAttribute, UnresolvedStar, UnresolvedRegex, and SubqueryExpression + * expressions. If the plan can be optimized, the new plan will contain an additional plan_id: the + * plan_id of the top-level WithRelations node. + * + * The current optimization uses a 2-pass approach. The first step identifies duplicate subtrees. + * This has a runtime and space complexity of O(num_unique_relations). The second step rewrites the + * plan. This has a runtime and space complexity of O(num_unique_relations). + * + * In theory this can be implemented as a single-pass algorithm by replacing duplicates with a + * reference once we identify them. This has two downsides: it requires that the client and the + * server have exactly the same traversal order, and it makes the plans much harder to read. + * + * @param nextPlanId generator for new plan_ids. + */ +class PlanOptimizer(nextPlanId: () => Long) { + def this(planIdGenerator: AtomicLong) = + this(() => planIdGenerator.incrementAndGet()) + + /** + * Optimize the given plan by deduplicating subtrees. + * + * @param plan + * The plan to optimize. + * @return + * The optimized plan with deduplicated subtrees. If the plan cannot be optimized, this returns + * the original plan. + */ + def optimize(plan: proto.Plan): proto.Plan = + PlanOptimizer.optimize(plan, nextPlanId) + + /** + * Optimize the given relation by deduplicating subtrees. + * + * @param relation + * The relation to optimize. + * @return + * The optimized relation with deduplicated subtrees. If the relation cannot be optimized, this + * returns the original relation. + */ + def optimize(relation: proto.Relation): proto.Relation = + PlanOptimizer.optimize(relation, nextPlanId) +} + +private[connect] object PlanOptimizer { + import RelationTreeUtils._ + + def optimize(plan: proto.Plan, nextPlanId: () => Long): proto.Plan = { + if (plan.hasRoot) { + val relation = plan.getRoot + val optimizedRelation = optimize(relation, nextPlanId) + if (optimizedRelation ne relation) { + plan.toBuilder.setRoot(optimizedRelation).build() + } else { + plan + } + } else { + plan + } + } + + def optimize(relation: proto.Relation, nextPlanId: () => Long): proto.Relation = { + val relations = analyze(relation) + if (relations.nonEmpty) { + rewriteRelation(relation, relations, nextPlanId) + } else { + relation + } + } + + /** + * Find all repeated (duplicate) query fragments in a query tree. + * + * @param root node of the query tree + * @return a map that contains all repeated query fragments, keyed by their plan id.
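+   *   For example, for a self-join such as `df.join(df)`, where both sides are the same
+   *   client-side Dataset and therefore share their plan_ids, the duplicated subtree is returned
+   *   once, keyed by the plan_id of its root; fragments that occur only once are not returned.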
+ */ + def analyze(root: proto.Relation): SeqMap[Long, proto.Relation] = { + // We can reduce memory consumption by using a bitset that tracks the planIds of nodes with a + // single occurrence. We only need to start tracking detailed information once there are + // multiple occurrences. For this we need a bitset that can deal with sparse planIds; there are + // libraries for this (e.g. RoaringBitMap), however that requires us to add a library to the + // Spark Connect client classpath which is something we need to trade off against overall size + // of that classpath. + val relationsMap = mutable.LinkedHashMap.empty[Long, RelationHolder] + visit(root) { + case relation @ PlanId(id) => + // Increase the stats for the plan id. If we have already seen the plan we will not + // visit its children, because we have already seen them before. + val holder = relationsMap.getOrElseUpdate(id, new RelationHolder(relation)) + holder.increaseNumOccurrences() == 1 + case _ => + // Always visit the subtree if there is no plan id. Its subtree might contain nodes we + // have not visited before. + true + } + + // Retain all relations that are duplicated. + relationsMap.to(SeqMap).collect { + case (id, holder) if holder.occurrences > 1 => + id -> holder.relation + } + } + + /** + * Rewrite the query tree using the map of reference relations. This transform moves all reference + * relations to a top-level WithRelations node, and replaces all instances of these relations with + * a reference. + * + * @param root relation to rewrite. + * @param referenceMap a map of relations that will be moved to the top-level withRelations node. + * @param nextPlanId function to generate the plan_id of the new root node. + * @return the rewritten plan. + */ + def rewriteRelation( + root: proto.Relation, + referenceMap: SeqMap[Long, proto.Relation], + nextPlanId: () => Long): proto.Relation = { + val builder = proto.Relation.newBuilder() + builder.getCommonBuilder.setPlanId(nextPlanId()) + val withRelationsBuilder = builder.getWithRelationsBuilder + val referencePlanIds = referenceMap.keySet + referenceMap.foreach { + case (id, reference) => + withRelationsBuilder.addReferences(rewriteSingleRelation( + reference, + referencePlanIds.filterNot(_ == id))) + } + withRelationsBuilder.setRoot(rewriteSingleRelation(root, referencePlanIds)) + builder.build() + } + + private def rewriteSingleRelation( + relation: proto.Relation, + referencePlanIds: Set[Long]): proto.Relation = transform(relation) { + case PlanId(id) if referencePlanIds(id) => + createReference(id) + case relation if relation.hasWithRelations => + // Rewrite the WithRelations node. We remove all reference plans if they are not a + // SubqueryAlias (the reference will be added to the top-level WithRelations node). We replace + // all references that are a SubqueryAlias with a reference. The latter is needed because + // WithRelations in combination with SubqueryAlias can be used to define named relations + // (like Common Table Expressions); names - unlike plan ids - are not unique. 
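+      // Concretely: a duplicated reference that is not a SubqueryAlias is dropped from this node,
+      // because its subtree is emitted exactly once under the new top-level WithRelations root; a
+      // duplicated SubqueryAlias reference is swapped for a plain reference so that the alias name
+      // remains declared in this scope while the aliased subtree moves to the top level.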
+ val withRelations = relation.getWithRelations + val builder = relation.toBuilder + val withRelationsBuilder = builder.getWithRelationsBuilder.clearReferences() + withRelations.getReferencesList.forEach { + case reference @ PlanId(id) if referencePlanIds(id) => + if (reference.hasSubqueryAlias) { + withRelationsBuilder.addReferences(createReference(id)) + } + case reference => + withRelationsBuilder.addReferences(reference) + } + builder.build() + } + + private def createReference(planId: Long): proto.Relation = { + // We don't set a plan id here because this is a reference to an existing plan. + proto.Relation.newBuilder().setReferencedPlanId(planId).build() + } + + object PlanId { + def apply(relation: proto.Relation): Long = unapply(relation).get + def get(relation: proto.Relation): Option[Long] = unapply(relation) + def unapply(relation: proto.Relation): Option[Long] = { + val common = relation.getCommon + if (common.hasPlanId) { + Some(common.getPlanId) + } else { + None + } + } + } + + private class RelationHolder(val relation: proto.Relation) { + private var numOccurrences = 0 + def occurrences: Int = numOccurrences + def increaseNumOccurrences(): Int = { + numOccurrences += 1 + numOccurrences + } + } +} + + diff --git a/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RelationTreeUtils.scala b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RelationTreeUtils.scala new file mode 100644 index 0000000000000..9a2bea7753d1b --- /dev/null +++ b/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RelationTreeUtils.scala @@ -0,0 +1,491 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client + +import java.util + +import scala.jdk.CollectionConverters._ + +import com.google.protobuf.Descriptors.FieldDescriptor +import com.google.protobuf.Message + +import org.apache.spark.connect.proto +import org.apache.spark.util.SparkEnvUtils + +/** + * Utility functions for visiting and transforming relation trees (a.k.a. query trees). + * + * This implementation is efficient for known Relation/Message types. For unknown message types we + * use proto reflection.
+ */ +private[connect] object RelationTreeUtils { + + private val NO_INPUT_REL_TYPE_CASES = { + val typeCases = util.EnumSet.noneOf(classOf[proto.Relation.RelTypeCase]) + typeCases.add(proto.Relation.RelTypeCase.RELTYPE_NOT_SET) + typeCases.add(proto.Relation.RelTypeCase.READ) + typeCases.add(proto.Relation.RelTypeCase.LOCAL_RELATION) + typeCases.add(proto.Relation.RelTypeCase.CACHED_LOCAL_RELATION) + typeCases.add(proto.Relation.RelTypeCase.CACHED_REMOTE_RELATION) + typeCases.add(proto.Relation.RelTypeCase.SQL) + typeCases.add(proto.Relation.RelTypeCase.RANGE) + typeCases.add(proto.Relation.RelTypeCase.COMMON_INLINE_USER_DEFINED_TABLE_FUNCTION) + typeCases.add(proto.Relation.RelTypeCase.COMMON_INLINE_USER_DEFINED_DATA_SOURCE) + typeCases.add(proto.Relation.RelTypeCase.UNRESOLVED_TABLE_VALUED_FUNCTION) + typeCases.add(proto.Relation.RelTypeCase.REFERENCED_PLAN_ID) + typeCases.add(proto.Relation.RelTypeCase.UNKNOWN) + typeCases.add(proto.Relation.RelTypeCase.CATALOG) + typeCases.add(proto.Relation.RelTypeCase.EXTENSION) + typeCases + } + + def visit(relation: proto.Relation)(f: proto.Relation => Boolean): Unit = { + visit[Null](relation, null) { (current, _) => + (f(current), null) + } + } + + /** + * Visit all [[proto.Relation relations]] in a tree. + * + * @param relation root of the tree. + * @param f visit callback. The children of a relation will be visited when this function + * returns true. + */ + def visit[T]( + relation: proto.Relation, context: T)( + f: (proto.Relation, T) => (Boolean, T)): Unit = { + val messages = new util.ArrayDeque[(Message, T)] + messages.push(relation -> context) + while (!messages.isEmpty) { + val (message, context) = messages.pop() + visitSingleMessage( + message, + context, (m, c: T) => messages.push(m -> c), + f) + } + } + + private def visitSingleMessage[T]( + message: Message, + context: T, + addMessage: (Message, T) => Unit, + f: (proto.Relation, T) => (Boolean, T)): Unit = { + message match { + case relation: proto.Relation => + val (continue, newContext) = f(relation, context) + def addRelation(next: proto.Relation): Unit = { + if (next ne proto.Relation.getDefaultInstance) { + addMessage(next, newContext) + } + } + if (continue) { + // TODO check that this is compiled into a table switch... + relation.getRelTypeCase match { + // Relations without inputs. + case relTypeCase if NO_INPUT_REL_TYPE_CASES.contains(relTypeCase) => + + // Single input relations. 
+ case proto.Relation.RelTypeCase.PROJECT => + addRelation(relation.getProject.getInput) + case proto.Relation.RelTypeCase.FILTER => + addRelation(relation.getFilter.getInput) + case proto.Relation.RelTypeCase.SORT => + addRelation(relation.getSort.getInput) + case proto.Relation.RelTypeCase.LIMIT => + addRelation(relation.getLimit.getInput) + case proto.Relation.RelTypeCase.AGGREGATE => + addRelation(relation.getAggregate.getInput) + case proto.Relation.RelTypeCase.SAMPLE => + addRelation(relation.getSample.getInput) + case proto.Relation.RelTypeCase.OFFSET => + addRelation(relation.getOffset.getInput) + case proto.Relation.RelTypeCase.DEDUPLICATE => + addRelation(relation.getDeduplicate.getInput) + case proto.Relation.RelTypeCase.SUBQUERY_ALIAS => + addRelation(relation.getSubqueryAlias.getInput) + case proto.Relation.RelTypeCase.REPARTITION => + addRelation(relation.getRepartition.getInput) + case proto.Relation.RelTypeCase.TO_DF => + addRelation(relation.getToDf.getInput) + case proto.Relation.RelTypeCase.WITH_COLUMNS_RENAMED => + addRelation(relation.getWithColumnsRenamed.getInput) + case proto.Relation.RelTypeCase.SHOW_STRING => + addRelation(relation.getShowString.getInput) + case proto.Relation.RelTypeCase.DROP => + addRelation(relation.getDrop.getInput) + case proto.Relation.RelTypeCase.TAIL => + addRelation(relation.getTail.getInput) + case proto.Relation.RelTypeCase.WITH_COLUMNS => + addRelation(relation.getWithColumns.getInput) + case proto.Relation.RelTypeCase.HINT => + addRelation(relation.getHint.getInput) + case proto.Relation.RelTypeCase.UNPIVOT => + addRelation(relation.getUnpivot.getInput) + case proto.Relation.RelTypeCase.TO_SCHEMA => + addRelation(relation.getToSchema.getInput) + case proto.Relation.RelTypeCase.REPARTITION_BY_EXPRESSION => + addRelation(relation.getRepartitionByExpression.getInput) + case proto.Relation.RelTypeCase.MAP_PARTITIONS => + addRelation(relation.getMapPartitions.getInput) + case proto.Relation.RelTypeCase.COLLECT_METRICS => + addRelation(relation.getCollectMetrics.getInput) + case proto.Relation.RelTypeCase.PARSE => + addRelation(relation.getParse.getInput) + case proto.Relation.RelTypeCase.WITH_WATERMARK => + addRelation(relation.getWithWatermark.getInput) + case proto.Relation.RelTypeCase.APPLY_IN_PANDAS_WITH_STATE => + addRelation(relation.getApplyInPandasWithState.getInput) + case proto.Relation.RelTypeCase.HTML_STRING => + addRelation(relation.getHtmlString.getInput) + case proto.Relation.RelTypeCase.TRANSPOSE => + addRelation(relation.getTranspose.getInput) + case proto.Relation.RelTypeCase.FILL_NA => + addRelation(relation.getFillNa.getInput) + case proto.Relation.RelTypeCase.DROP_NA => + addRelation(relation.getDropNa.getInput) + case proto.Relation.RelTypeCase.REPLACE => + addRelation(relation.getReplace.getInput) + case proto.Relation.RelTypeCase.SUMMARY => + addRelation(relation.getSummary.getInput) + case proto.Relation.RelTypeCase.CROSSTAB => + addRelation(relation.getCrosstab.getInput) + case proto.Relation.RelTypeCase.DESCRIBE => + addRelation(relation.getDescribe.getInput) + case proto.Relation.RelTypeCase.COV => + addRelation(relation.getCov.getInput) + case proto.Relation.RelTypeCase.CORR => + addRelation(relation.getCorr.getInput) + case proto.Relation.RelTypeCase.APPROX_QUANTILE => + addRelation(relation.getApproxQuantile.getInput) + case proto.Relation.RelTypeCase.FREQ_ITEMS => + addRelation(relation.getFreqItems.getInput) + case proto.Relation.RelTypeCase.SAMPLE_BY => + addRelation(relation.getSampleBy.getInput) + + // 
Multi input relations + case proto.Relation.RelTypeCase.JOIN => + val join = relation.getJoin + addRelation(join.getLeft) + addRelation(join.getRight) + case proto.Relation.RelTypeCase.SET_OP => + val setOp = relation.getSetOp + addRelation(setOp.getLeftInput) + addRelation(setOp.getRightInput) + case proto.Relation.RelTypeCase.GROUP_MAP => + val groupMap = relation.getGroupMap + addRelation(groupMap.getInput) + addRelation(groupMap.getInitialInput) + case proto.Relation.RelTypeCase.CO_GROUP_MAP => + val coGroupMap = relation.getCoGroupMap + addRelation(coGroupMap.getInput) + addRelation(coGroupMap.getOther) + case proto.Relation.RelTypeCase.AS_OF_JOIN => + val asOfJoin = relation.getAsOfJoin + addRelation(asOfJoin.getLeft) + addRelation(asOfJoin.getRight) + case proto.Relation.RelTypeCase.WITH_RELATIONS => + val withRelations = relation.getWithRelations + withRelations.getReferencesList.forEach(addRelation(_)) + addRelation(withRelations.getRoot) + case proto.Relation.RelTypeCase.LATERAL_JOIN => + val lateralJoin = relation.getLateralJoin + addRelation(lateralJoin.getLeft) + addRelation(lateralJoin.getRight) + case proto.Relation.RelTypeCase.ML_RELATION => + val mlRelation = relation.getMlRelation + if (mlRelation.hasTransform) { + addRelation(mlRelation.getTransform.getInput) + } else if (mlRelation.hasFetch) { + mlRelation.getFetch.getMethodsList.forEach { method => + method.getArgsList.forEach { args => + if (args.hasInput) { + addRelation(args.getInput) + } + } + } + } + addRelation(mlRelation.getModelSummaryDataset) + + // Unhandled relation type. Fall back to proto reflection. + case relTypeCase => + assert(!SparkEnvUtils.isTesting, + "Unhandled relTypeCase: " + relTypeCase) + val descriptor = relation.getDescriptorForType + .findFieldByNumber(relTypeCase.getNumber) + if (descriptor != null && descriptor.getType == FieldDescriptor.Type.MESSAGE) { + addMessage(relation.getField(descriptor).asInstanceOf[Message], newContext) + } + } + } + + case message => + // Unknown message. Fall back to proto reflection. + assert(!SparkEnvUtils.isTesting, + "Unhandled Message type: " + message.getDescriptorForType.getName) + message.getAllFields.forEach { (desc, value) => + if (desc.getType == FieldDescriptor.Type.MESSAGE) { + value match { + case list: util.List[Message @unchecked] => + list.forEach(addMessage(_, context)) + case message: Message => + addMessage(message, context) + } + } + } + } + } + + /** + * Recursively transform a [[proto.Relation relation]]. + * + * @param message entry point. + * @param pf transformation to apply to all relations. + * @tparam M type of the current message. + * @return the transformed relation. + */ + private[connect] def transform[M <: Message]( + message: M)( + pf: PartialFunction[proto.Relation, proto.Relation]): M = { + def transformRelation(relation: proto.Relation, set: proto.Relation => Any): Unit = { + if (relation ne proto.Relation.getDefaultInstance) { + set(transform(relation)(pf)) + } + } + def transformMessage(value: Message, fd: FieldDescriptor, builder: Message.Builder): Unit = { + builder.setField(fd, transform(value)(pf)) + } + def result(builder: Message.Builder): M = { + val result = builder.build() + if (result == message) { + message + } else { + result.asInstanceOf[M] + } + } + + message match { + case relation: proto.Relation => + val transformed = pf.applyOrElse(relation, identity[proto.Relation]) + val builder = transformed.toBuilder + // Transform input relations. + builder.getRelTypeCase match { + // Relations without inputs. 
+ case relTypeCase if NO_INPUT_REL_TYPE_CASES.contains(relTypeCase) => + + // Single input relations + case proto.Relation.RelTypeCase.PROJECT => + val typeCaseBuilder = builder.getProjectBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.FILTER => + val typeCaseBuilder = builder.getFilterBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.SORT => + val typeCaseBuilder = builder.getSortBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.LIMIT => + val typeCaseBuilder = builder.getLimitBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.AGGREGATE => + val typeCaseBuilder = builder.getAggregateBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.SAMPLE => + val typeCaseBuilder = builder.getSampleBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.OFFSET => + val typeCaseBuilder = builder.getOffsetBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.DEDUPLICATE => + val typeCaseBuilder = builder.getDeduplicateBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.SUBQUERY_ALIAS => + val typeCaseBuilder = builder.getSubqueryAliasBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.REPARTITION => + val typeCaseBuilder = builder.getRepartitionBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.TO_DF => + val typeCaseBuilder = builder.getToDfBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.WITH_COLUMNS_RENAMED => + val typeCaseBuilder = builder.getWithColumnsRenamedBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.SHOW_STRING => + val typeCaseBuilder = builder.getShowStringBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.DROP => + val typeCaseBuilder = builder.getDropBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.TAIL => + val typeCaseBuilder = builder.getTailBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.WITH_COLUMNS => + val typeCaseBuilder = builder.getWithColumnsBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.HINT => + val typeCaseBuilder = builder.getHintBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.UNPIVOT => + val typeCaseBuilder = builder.getUnpivotBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.TO_SCHEMA => + val typeCaseBuilder = builder.getToSchemaBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.REPARTITION_BY_EXPRESSION => + val typeCaseBuilder = builder.getRepartitionByExpressionBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case 
proto.Relation.RelTypeCase.MAP_PARTITIONS => + val typeCaseBuilder = builder.getMapPartitionsBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.COLLECT_METRICS => + val typeCaseBuilder = builder.getCollectMetricsBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.PARSE => + val typeCaseBuilder = builder.getParseBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.WITH_WATERMARK => + val typeCaseBuilder = builder.getWithWatermarkBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.APPLY_IN_PANDAS_WITH_STATE => + val typeCaseBuilder = builder.getApplyInPandasWithStateBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.HTML_STRING => + val typeCaseBuilder = builder.getHtmlStringBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.TRANSPOSE => + val typeCaseBuilder = builder.getTransposeBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.FILL_NA => + val typeCaseBuilder = builder.getFillNaBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.DROP_NA => + val typeCaseBuilder = builder.getDropNaBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.REPLACE => + val typeCaseBuilder = builder.getReplaceBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.SUMMARY => + val typeCaseBuilder = builder.getSummaryBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.CROSSTAB => + val typeCaseBuilder = builder.getCrosstabBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.DESCRIBE => + val typeCaseBuilder = builder.getDescribeBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.COV => + val typeCaseBuilder = builder.getCovBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.CORR => + val typeCaseBuilder = builder.getCorrBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.APPROX_QUANTILE => + val typeCaseBuilder = builder.getApproxQuantileBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.FREQ_ITEMS => + val typeCaseBuilder = builder.getFreqItemsBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + case proto.Relation.RelTypeCase.SAMPLE_BY => + val typeCaseBuilder = builder.getSampleByBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + + // Multi-input relations. 
+ case proto.Relation.RelTypeCase.JOIN => + val typeCaseBuilder = builder.getJoinBuilder + transformRelation(typeCaseBuilder.getLeft, typeCaseBuilder.setLeft) + transformRelation(typeCaseBuilder.getRight, typeCaseBuilder.setRight) + case proto.Relation.RelTypeCase.SET_OP => + val typeCaseBuilder = builder.getSetOpBuilder + transformRelation(typeCaseBuilder.getLeftInput, typeCaseBuilder.setLeftInput) + transformRelation(typeCaseBuilder.getRightInput, typeCaseBuilder.setRightInput) + case proto.Relation.RelTypeCase.GROUP_MAP => + val typeCaseBuilder = builder.getGroupMapBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + transformRelation(typeCaseBuilder.getInitialInput, typeCaseBuilder.setInitialInput) + case proto.Relation.RelTypeCase.CO_GROUP_MAP => + val typeCaseBuilder = builder.getCoGroupMapBuilder + transformRelation(typeCaseBuilder.getInput, typeCaseBuilder.setInput) + transformRelation(typeCaseBuilder.getOther, typeCaseBuilder.setOther) + case proto.Relation.RelTypeCase.AS_OF_JOIN => + val typeCaseBuilder = builder.getAsOfJoinBuilder + transformRelation(typeCaseBuilder.getLeft, typeCaseBuilder.setLeft) + transformRelation(typeCaseBuilder.getRight, typeCaseBuilder.setRight) + case proto.Relation.RelTypeCase.WITH_RELATIONS => + val typeCaseBuilder = builder.getWithRelationsBuilder + (0 until typeCaseBuilder.getReferencesCount).foreach { i => + transformRelation( + typeCaseBuilder.getReferences(i), + typeCaseBuilder.setReferences(i, _)) + } + transformRelation(typeCaseBuilder.getRoot, typeCaseBuilder.setRoot) + case proto.Relation.RelTypeCase.LATERAL_JOIN => + val typeCaseBuilder = builder.getLateralJoinBuilder + transformRelation(typeCaseBuilder.getLeft, typeCaseBuilder.setLeft) + transformRelation(typeCaseBuilder.getRight, typeCaseBuilder.setRight) + case proto.Relation.RelTypeCase.ML_RELATION => + val typeCaseBuilder = builder.getMlRelationBuilder + if (typeCaseBuilder.hasTransform) { + val transformBuilder = typeCaseBuilder.getTransformBuilder + transformRelation(transformBuilder.getInput, transformBuilder.setInput) + } else if (typeCaseBuilder.hasFetch) { + val fetchBuilder = typeCaseBuilder.getFetchBuilder + (0 until fetchBuilder.getMethodsCount).foreach { i => + val methodBuilder = fetchBuilder.getMethodsBuilder(i) + (0 until methodBuilder.getArgsCount).foreach { j => + val argsBuilder = methodBuilder.getArgsBuilder(j) + if (argsBuilder.hasInput) { + transformRelation(argsBuilder.getInput, argsBuilder.setInput) + } + } + } + } + transformRelation( + typeCaseBuilder.getModelSummaryDataset, + typeCaseBuilder.setModelSummaryDataset) + + // Unhandled relation type. Fall back to proto reflection. + case relTypeCase => + assert(!SparkEnvUtils.isTesting) + val descriptor = builder.getDescriptorForType + .findFieldByNumber(relTypeCase.getNumber) + if (descriptor != null && descriptor.getType == FieldDescriptor.Type.MESSAGE) { + val value = builder.getField(descriptor).asInstanceOf[Message] + transformMessage(value, descriptor, builder) + } + } + result(builder) + + case message => + // Unknown message type. Fall back to proto reflection. 
+ val builder = message.toBuilder + message.getAllFields.forEach { (desc, value) => + if (desc.getType == FieldDescriptor.Type.MESSAGE) { + value match { + case list: util.List[Message @unchecked] => + list.asScala.zipWithIndex.foreach { + case (element, i) => + builder.setRepeatedField(desc, i, transform(element)(pf)) + } + case item: Message => + transformMessage(item, desc, builder) + } + } + } + result(builder) + } + } +} diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index bcd643a30253f..1b44ee2644c69 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.connect.planner -import java.util.Properties -import java.util.UUID +import java.util.{Properties, UUID} import scala.collection.mutable import scala.jdk.CollectionConverters._ @@ -45,24 +44,23 @@ import org.apache.spark.internal.{Logging, LogKeys, MDC} import org.apache.spark.internal.LogKeys.{DATAFRAME_ID, SESSION_ID} import org.apache.spark.resource.{ExecutorResourceRequest, ResourceProfile, TaskResourceProfile, TaskResourceRequest} import org.apache.spark.sql.{Column, Encoders, ForeachWriter, Observation, Row} -import org.apache.spark.sql.catalyst.{expressions, AliasIdentifier, FunctionIdentifier, QueryPlanningTracker} -import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, GlobalTempView, LocalTempView, MultiAlias, NameParameterizedQuery, PosParameterizedQuery, UnresolvedAlias, UnresolvedAttribute, UnresolvedDataFrameStar, UnresolvedDeserializer, UnresolvedExtractValue, UnresolvedFunction, UnresolvedOrdinal, UnresolvedPlanId, UnresolvedRegex, UnresolvedRelation, UnresolvedStar, UnresolvedStarWithColumns, UnresolvedStarWithColumnsRenames, UnresolvedSubqueryColumnAliases, UnresolvedTableValuedFunction, UnresolvedTranspose} +import org.apache.spark.sql.catalyst.{expressions, AliasIdentifier, FunctionIdentifier} +import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, GlobalTempView, LocalTempView, MultiAlias, NameParameterizedQuery, PosParameterizedQuery, UnresolvedAlias, UnresolvedAttribute, UnresolvedDataFrameStar, UnresolvedDeserializer, UnresolvedExtractValue, UnresolvedFunction, UnresolvedOrdinal, UnresolvedRegex, UnresolvedRelation, UnresolvedStar, UnresolvedStarWithColumns, UnresolvedStarWithColumnsRenames, UnresolvedSubqueryColumnAliases, UnresolvedTableValuedFunction, UnresolvedTranspose} import org.apache.spark.sql.catalyst.encoders.{encoderFor, AgnosticEncoder, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ProductEncoder, RowEncoder => AgnosticRowEncoder, StringEncoder, UnboundRowEncoder} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.parser.{ParseException, ParserUtils} -import org.apache.spark.sql.catalyst.plans.{Cross, FullOuter, Inner, JoinType, LeftAnti, LeftOuter, LeftSemi, RightOuter, UsingJoin} -import org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.catalyst.plans.logical.{AppendColumns, Assignment, CoGroup, CollectMetrics, CommandResult, CompoundBody, Deduplicate, DeduplicateWithinWatermark, DeleteAction, DeserializeToObject, Except, 
FlatMapGroupsWithState, InsertAction, InsertStarAction, Intersect, JoinWith, LocalRelation, LogicalGroupState, LogicalPlan, MapGroups, MapPartitions, MergeAction, Project, Sample, SerializeFromObject, Sort, SubqueryAlias, TimeModes, TransformWithState, TypedFilter, Union, Unpivot, UnresolvedHint, UpdateAction, UpdateEventTimeWatermarkColumn, UpdateStarAction} +import org.apache.spark.sql.catalyst.plans.{logical, Cross, FullOuter, Inner, JoinType, LeftAnti, LeftOuter, LeftSemi, RightOuter, UsingJoin} +import org.apache.spark.sql.catalyst.plans.logical.{AppendColumns, Assignment, CoGroup, CollectMetrics, CommandResult, CompoundBody, Deduplicate, DeduplicateWithinWatermark, DeleteAction, DeserializeToObject, Except, FlatMapGroupsWithState, InsertAction, InsertStarAction, Intersect, JoinWith, LocalRelation, LogicalGroupState, LogicalPlan, MapGroups, MapPartitions, MergeAction, Project, Sample, SerializeFromObject, Sort, SubqueryAlias, TimeModes, TransformWithState, TypedFilter, Union, Unpivot, UnresolvedHint, UnresolvedWith, UpdateAction, UpdateEventTimeWatermarkColumn, UpdateStarAction} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes -import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin, TreePattern} +import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils} import org.apache.spark.sql.classic.{Catalog, Dataset, MergeIntoWriter, RelationalGroupedDataset, SparkSession, TypedAggUtils, UserDefinedFunctionUtils} import org.apache.spark.sql.classic.ClassicConversions._ import org.apache.spark.sql.connect.client.arrow.ArrowSerializer -import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, ForeachWriterPacket, LiteralValueProtoConverter, StorageLevelProtoConverter, StreamingListenerPacket, UdfPacket} +import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, ForeachWriterPacket, InvalidPlanInput, LiteralValueProtoConverter, StorageLevelProtoConverter, StreamingListenerPacket, UdfPacket} import org.apache.spark.sql.connect.config.Connect.CONNECT_GRPC_ARROW_MAX_BATCH_SIZE import org.apache.spark.sql.connect.ml.MLHandler import org.apache.spark.sql.connect.pipelines.PipelinesHandler @@ -120,6 +118,24 @@ class SparkConnectPlanner( private lazy val pythonExec = sys.env.getOrElse("PYSPARK_PYTHON", sys.env.getOrElse("PYSPARK_DRIVER_PYTHON", "python3")) + private val RELATION_COMPUTATION_IN_PROGRESS = new AnyRef + private val relationCache = mutable.Map.empty[Long, AnyRef] + + private def getCachedRelation(planId: Long): LogicalPlan = { + relationCache.get(planId) match { + case Some(plan: LogicalPlan) => plan + case Some(RELATION_COMPUTATION_IN_PROGRESS) => + throw InvalidPlanInput(s"Cyclic plan reference for plan ID: $planId") + case Some(relation: proto.Relation) => + relationCache.put(planId, RELATION_COMPUTATION_IN_PROGRESS) + val plan = transformRelation(relation) + relationCache.update(planId, plan) + plan + case _ => + throw InvalidInputErrors.invalidWithRelationReference() + } + } + /** * The root of the query plan is a relation and we apply the transformations to it. The resolved * logical plan will not get cached. 
If the result needs to be cached, use @@ -221,6 +237,8 @@ class SparkConnectPlanner( case proto.Relation.RelTypeCase.COLLECT_METRICS => transformCollectMetrics(rel.getCollectMetrics, rel.getCommon.getPlanId) case proto.Relation.RelTypeCase.PARSE => transformParse(rel.getParse) + case proto.Relation.RelTypeCase.REFERENCED_PLAN_ID => + getCachedRelation(rel.getReferencedPlanId) // Catalog API (internal-only) case proto.Relation.RelTypeCase.CATALOG => transformCatalog(rel.getCatalog) @@ -344,13 +362,6 @@ class SparkConnectPlanner( } } - private def transformSqlWithRefs(query: proto.WithRelations): LogicalPlan = { - if (!isValidSQLWithRefs(query)) { - throw InvalidInputErrors.invalidSQLWithReferences(query) - } - executeSQLWithRefs(query).logicalPlan - } - private def transformSubqueryAlias(alias: proto.SubqueryAlias): LogicalPlan = { val aliasIdentifier = if (alias.getQualifierCount > 0) { @@ -2793,14 +2804,12 @@ class SparkConnectPlanner( .build() } - val df = relation.getRelTypeCase match { - case proto.Relation.RelTypeCase.SQL => - executeSQL(relation.getSql, tracker) - case proto.Relation.RelTypeCase.WITH_RELATIONS => - executeSQLWithRefs(relation.getWithRelations, tracker) - case other => - throw InvalidInputErrors.sqlCommandExpectsSqlOrWithRelations(other) + // Only allow a SQL relation or a SQL relation nested in a WithRelations relation. + if (!relation.hasSql && + !(relation.hasWithRelations && relation.getWithRelations.getRoot.hasSql)) { + throw InvalidInputErrors.sqlCommandExpectsSqlOrWithRelations(relation.getRelTypeCase) } + val df = Dataset.ofRows(session, transformRelation(relation), tracker) // Check if command or SQL Script has been executed. val isCommand = df.queryExecution.commandExecuted.isInstanceOf[CommandResult] @@ -2881,79 +2890,6 @@ class SparkConnectPlanner( } } - private def isValidSQLWithRefs(query: proto.WithRelations): Boolean = { - query.getRoot.getRelTypeCase match { - case proto.Relation.RelTypeCase.SQL => - case _ => return false - } - if (query.getReferencesCount == 0) { - return false - } - query.getReferencesList.iterator().asScala.foreach { ref => - ref.getRelTypeCase match { - case proto.Relation.RelTypeCase.SUBQUERY_ALIAS => - case _ => return false - } - } - true - } - - private def executeSQLWithRefs( - query: proto.WithRelations, - tracker: QueryPlanningTracker = new QueryPlanningTracker) = { - if (!isValidSQLWithRefs(query)) { - throw InvalidInputErrors.invalidSQLWithReferences(query) - } - - // Eagerly execute commands of the provided SQL string, with given references. - val sql = query.getRoot.getSql - this.synchronized { - try { - query.getReferencesList.asScala.foreach { ref => - Dataset - .ofRows(session, transformRelation(ref.getSubqueryAlias.getInput)) - .createOrReplaceTempView(ref.getSubqueryAlias.getAlias) - } - executeSQL(sql, tracker) - } finally { - // drop all temporary views - query.getReferencesList.asScala.foreach { ref => - session.catalog.dropTempView(ref.getSubqueryAlias.getAlias) - } - } - } - } - - private def executeSQL( - sql: proto.SQL, - tracker: QueryPlanningTracker = new QueryPlanningTracker) = { - // Eagerly execute commands of the provided SQL string. 
- val args = sql.getArgsMap - val namedArguments = sql.getNamedArgumentsMap - val posArgs = sql.getPosArgsList - val posArguments = sql.getPosArgumentsList - if (!namedArguments.isEmpty) { - session.sql( - sql.getQuery, - namedArguments.asScala.toMap.transform((_, e) => Column(transformExpression(e))), - tracker) - } else if (!posArguments.isEmpty) { - session.sql( - sql.getQuery, - posArguments.asScala.map(e => Column(transformExpression(e))).toArray, - tracker) - } else if (!args.isEmpty) { - session.sql( - sql.getQuery, - args.asScala.toMap.transform((_, v) => transformLiteral(v)), - tracker) - } else if (!posArgs.isEmpty) { - session.sql(sql.getQuery, posArgs.asScala.map(transformLiteral).toArray, tracker) - } else { - session.sql(sql.getQuery, Map.empty[String, Any], tracker) - } - } - private def handleRegisterUserDefinedFunction( fun: proto.CommonInlineUserDefinedFunction): Unit = { fun.getFunctionCase match { @@ -4057,72 +3993,73 @@ class SparkConnectPlanner( private def transformSubqueryExpression( getSubqueryExpression: proto.SubqueryExpression): Expression = { - val planId = getSubqueryExpression.getPlanId + val plan = getCachedRelation(getSubqueryExpression.getPlanId) getSubqueryExpression.getSubqueryType match { case proto.SubqueryExpression.SubqueryType.SUBQUERY_TYPE_SCALAR => - UnresolvedScalarSubqueryPlanId(planId) + ScalarSubquery(plan) case proto.SubqueryExpression.SubqueryType.SUBQUERY_TYPE_EXISTS => - UnresolvedExistsPlanId(planId) + Exists(plan) case proto.SubqueryExpression.SubqueryType.SUBQUERY_TYPE_TABLE_ARG => if (getSubqueryExpression.hasTableArgOptions) { val options = getSubqueryExpression.getTableArgOptions - UnresolvedTableArgPlanId( - planId, - partitionSpec = options.getPartitionSpecList.asScala + FunctionTableSubqueryArgumentExpression( + plan, + partitionByExpressions = options.getPartitionSpecList.asScala .map(transformExpression) .toSeq, - orderSpec = options.getOrderSpecList.asScala + orderByExpressions = options.getOrderSpecList.asScala .map(transformSortOrder) .toSeq, withSinglePartition = options.hasWithSinglePartition && options.getWithSinglePartition) } else { - UnresolvedTableArgPlanId(planId) + FunctionTableSubqueryArgumentExpression(plan) } case proto.SubqueryExpression.SubqueryType.SUBQUERY_TYPE_IN => - UnresolvedInSubqueryPlanId( - getSubqueryExpression.getInSubqueryValuesList.asScala.map { value => - transformExpression(value) - }.toSeq, - planId) + val values = getSubqueryExpression.getInSubqueryValuesList.asScala.map { value => + transformExpression(value) + }.toSeq + InSubquery(values, ListQuery(plan)) case other => throw InvalidInputErrors.invalidEnum(other) } } private def transformWithRelations(getWithRelations: proto.WithRelations): LogicalPlan = { - if (isValidSQLWithRefs(getWithRelations)) { - transformSqlWithRefs(getWithRelations) + // Register the plans in the relation cache, so they can be resolved while + // transforming the root relation into a LogicalPlan. 
+ val namedReferences = mutable.Buffer.empty[(String, proto.Relation)] + getWithRelations.getReferencesList.forEach { ref => + val common = ref.getCommon + if (!common.hasPlanId && !ref.hasSubqueryAlias) { + throw InvalidInputErrors.invalidWithRelationReference() + } + if (common.hasPlanId) { + relationCache.put(common.getPlanId, ref) + } + if (ref.hasSubqueryAlias) { + namedReferences += ref.getSubqueryAlias.getAlias -> ref + } + } + + val root = transformRelation(getWithRelations.getRoot) + if (namedReferences.nonEmpty) { + // If WithRelations contains named references we create a CTE. This is needed because it is + // allowed to nest WithRelations nodes and names used in a parent node can be reused + // (overwritten) by a child node. + val ctes = namedReferences.map { + case (name, relation) => + assert(relation.hasSubqueryAlias) + val plan = if (relation.getCommon.hasPlanId) { + getCachedRelation(relation.getCommon.getPlanId) + } else { + transformRelation(relation) + } + (name, plan.asInstanceOf[SubqueryAlias], None) + } + UnresolvedWith(root, ctes.toSeq) } else { // Wrap the plan to keep the original planId. - val plan = Project(Seq(UnresolvedStar(None)), transformRelation(getWithRelations.getRoot)) - - val relations = getWithRelations.getReferencesList.asScala.map { ref => - if (ref.hasCommon && ref.getCommon.hasPlanId) { - val planId = ref.getCommon.getPlanId - val plan = transformRelation(ref) - planId -> plan - } else { - throw InvalidInputErrors.invalidWithRelationReference() - } - }.toMap - - val missingPlanIds = mutable.Set.empty[Long] - val withRelations = plan - .transformAllExpressionsWithPruning(_.containsPattern(TreePattern.UNRESOLVED_PLAN_ID)) { - case u: UnresolvedPlanId => - if (relations.contains(u.planId)) { - u.withPlan(relations(u.planId)) - } else { - missingPlanIds += u.planId - u - } - } - assertPlan( - missingPlanIds.isEmpty, - "Missing relation in WithRelations: " + - s"${missingPlanIds.mkString("(", ", ", ")")} not in " + - s"${relations.keys.mkString("(", ", ", ")")}") - withRelations + Project(Seq(UnresolvedStar(None)), root) } }
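Note for reviewers: below is a minimal, illustrative sketch (not part of this patch) of how the client-side deduplication introduced above is expected to behave on a self-join. The table name "t", the explicit plan ids, the AtomicLong seed, and the object name PlanOptimizerExample are assumptions made for the example; real plans get their plan ids assigned by the client, and the builder calls used are the standard protobuf-generated API for spark/connect/relations.proto.

import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.connect.proto
import org.apache.spark.sql.connect.client.PlanOptimizer

object PlanOptimizerExample {
  // Stamp a plan id on a relation builder, mimicking what the client does for every Dataset.
  private def withPlanId(builder: proto.Relation.Builder, id: Long): proto.Relation = {
    builder.getCommonBuilder.setPlanId(id)
    builder.build()
  }

  def main(args: Array[String]): Unit = {
    // A scan of table "t" (plan id 1) that is used twice in a self-join (plan id 2).
    val readBuilder = proto.Relation.newBuilder()
    readBuilder.getReadBuilder.getNamedTableBuilder.setUnparsedIdentifier("t")
    val read = withPlanId(readBuilder, 1L)

    val joinBuilder = proto.Relation.newBuilder()
    joinBuilder.getJoinBuilder
      .setLeft(read)
      .setRight(read)
      .setJoinType(proto.Join.JoinType.JOIN_TYPE_INNER)
    val join = withPlanId(joinBuilder, 2L)

    val optimizer = new PlanOptimizer(new AtomicLong(100L))
    val optimized = optimizer.optimize(join)

    // Expected shape: a new top-level WithRelations node (with a fresh plan id) holding the scan
    // exactly once, while both join inputs are rewritten to referenced_plan_id = 1.
    assert(optimized.hasWithRelations)
    val withRelations = optimized.getWithRelations
    assert(withRelations.getReferencesCount == 1)
    assert(withRelations.getReferences(0).getCommon.getPlanId == 1L)
    val rewrittenJoin = withRelations.getRoot.getJoin
    assert(rewrittenJoin.getLeft.getReferencedPlanId == 1L)
    assert(rewrittenJoin.getRight.getReferencedPlanId == 1L)
  }
}

If the plan has this shape, the server resolves each referenced_plan_id through the relationCache added to SparkConnectPlanner, so the shared subtree is only transformed into a LogicalPlan once.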