2525from datahub .sdk .search_filters import Filter , FilterDsl , load_filters
2626from datahub .utilities .ordered_set import OrderedSet
2727from fastmcp import FastMCP
28- from pydantic import BaseModel
28+ from pydantic import BaseModel , Field
29+ from functools import lru_cache
2930
3031_P = ParamSpec ("_P" )
3132_R = TypeVar ("_R" )
@@ -173,7 +174,80 @@ def _clean_get_entity_response(raw_response: dict) -> dict:
173174 return response
174175
175176
176- @mcp .tool (description = "Get an entity by its DataHub URN." )
class SemanticVersionStruct(BaseModel):
    """A single schema version entry: its semantic version and its version stamp.

    Both fields are required strings and are validated from the camelCase keys
    given as aliases.
    """

    # Read from the "semanticVersion" key of the validated payload.
    semantic_version: str = Field(..., alias="semanticVersion")
    # Read from the "versionStamp" key of the validated payload.
    version_stamp: str = Field(..., alias="versionStamp")
180+
181+
class SchemaVersionList(BaseModel):
    """The schema versions reported for a dataset.

    Holds the entry flagged as latest together with the list of all entries.
    """

    # The entry reported as the most recent schema version.
    latest_version: SemanticVersionStruct = Field(...)
    # Every reported schema version entry, in the order it was received.
    versions: list[SemanticVersionStruct] = Field(...)
185+
186+
class DatasetSchemaAPI:
    """Look up schema-version information for datasets through a DataHub graph client."""

    def __init__(self, graph: DataHubGraph) -> None:
        """Store the graph client that every query of this API is executed against."""
        self._graph = graph

    def get_schema_version_list(self, dataset_urn: str) -> SchemaVersionList | None:
        """Fetch the schema versions recorded for ``dataset_urn``.

        Args:
            dataset_urn: URN of the dataset whose schema versions are requested.

        Returns:
            The parsed version list, or ``None`` when the response carries no
            ``getSchemaVersionList`` payload or that payload has no
            ``latestVersion`` entry.
        """
        resp = _execute_graphql(
            self._graph,
            query=entity_details_fragment_gql,
            variables={"input": {"datasetUrn": dataset_urn}},
            operation_name="getSchemaVersionList",
        )

        if not (raw_schema_versions := resp.get("getSchemaVersionList")):
            return None

        # A missing or null latest version cannot be validated into the
        # required model fields; report "nothing available" instead of letting
        # a validation error escape to every caller.
        if not (raw_latest := raw_schema_versions.get("latestVersion")):
            return None

        # `.get(key, [])` still yields None for an explicit JSON null, so the
        # fallback has to be applied with `or`.
        raw_versions = raw_schema_versions.get("semanticVersionList") or []

        return SchemaVersionList(
            latest_version=SemanticVersionStruct.model_validate(raw_latest),
            versions=[
                SemanticVersionStruct.model_validate(raw_version)
                for raw_version in raw_versions
            ],
        )

    def get_versioned_dataset(
        self, dataset_urn: str, semantic_version: str
    ) -> dict[str, Any]:
        """Fetch the dataset as of the given semantic schema version.

        Args:
            dataset_urn: URN of the dataset to fetch.
            semantic_version: Semantic version string to resolve to a version
                stamp before querying.

        Returns:
            The cleaned ``versionedDataset`` payload; an empty payload is
            passed to the cleaner when the response has none.

        Raises:
            ValueError: If the dataset has no schema version list or the
                requested semantic version is not part of it.
        """
        variables = {
            "urn": dataset_urn,
            "versionStamp": self._get_version_timestamp(dataset_urn, semantic_version),
        }
        resp = _execute_graphql(
            self._graph,
            query=entity_details_fragment_gql,
            variables=variables,
            operation_name="getVersionedDataset",
        )
        # `or {}` also covers an explicit null, which `.get(key, {})` lets through.
        return _clean_gql_response(resp.get("versionedDataset") or {})

    def _get_version_timestamp(self, dataset_urn: str, semantic_version: str) -> str:
        """Resolve ``semantic_version`` to the version stamp recorded for it.

        Raises:
            ValueError: If no schema version list is available for the dataset
                or the semantic version has no (non-empty) version stamp.
        """
        if not (schema_version_list := self.get_schema_version_list(dataset_urn)):
            raise ValueError(f"No schema_version_list found for dataset {dataset_urn}")

        # Built as a dict so that, for duplicate semantic versions, the last
        # entry in the list wins.
        version_stamp_mapping = {
            struct.semantic_version: struct.version_stamp
            for struct in schema_version_list.versions
        }

        if not (version_stamp := version_stamp_mapping.get(semantic_version)):
            raise ValueError(
                f"Version '{semantic_version}' not found for dataset '{dataset_urn}'"
            )
        return version_stamp
246+
247+
248+ @mcp .tool (
249+ description = "Get an entity by its DataHub URN. This also provide schema_version_list(latest version, all versions) if available."
250+ )
177251@async_background
178252def get_entity (urn : str ) -> dict :
179253 client = get_datahub_client ()
@@ -193,6 +267,22 @@ def get_entity(urn: str) -> dict:
193267
194268 _inject_urls_for_urns (client ._graph , result , ["" ])
195269
270+ if result .get ("urn" , "" ).startswith ("urn:li:dataset:" ):
271+ schema_api = DatasetSchemaAPI (client ._graph )
272+
273+ if schema_version_list := schema_api .get_schema_version_list (urn ):
274+ sorted_versions = sorted (
275+ [v .semantic_version for v in schema_version_list .versions ]
276+ )
277+ latest_semantic_version = (
278+ schema_version_list .latest_version .semantic_version
279+ )
280+
281+ result ["schemaVersionList" ] = {
282+ "latestVersion" : latest_semantic_version ,
283+ "versions" : sorted_versions ,
284+ }
285+
196286 return _clean_get_entity_response (result )
197287
198288
@@ -441,3 +531,13 @@ def get_lineage(
441531 lineage = lineage_api .get_lineage (asset_lineage_directive )
442532 _inject_urls_for_urns (client ._graph , lineage , ["*.searchResults[].entity" ])
443533 return lineage
534+
535+
@mcp.tool(description="Get schema from a dataset by its URN and version.")
@async_background
def get_versioned_dataset(dataset_urn: str, semantic_version: str) -> dict:
    """Return the dataset identified by ``dataset_urn`` as of ``semantic_version``.

    Args:
        dataset_urn: URN of the dataset to fetch.
        semantic_version: Semantic schema version to look up.

    Returns:
        Whatever ``DatasetSchemaAPI.get_versioned_dataset`` returns for the
        pair, fetched through the current DataHub client.
    """
    # Deliberately not memoised with functools.lru_cache: a process-wide cache
    # keyed only on (urn, version) ignores which client get_datahub_client()
    # returns, hands the same mutable dict to every caller, and never expires
    # an entry whose version stamp changes upstream.
    client = get_datahub_client()
    return DatasetSchemaAPI(client._graph).get_versioned_dataset(
        dataset_urn, semantic_version
    )
0 commit comments