tensorflow
diff --git a/‎tensorflow_gnn/experimental/sampler/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎tensorflow_gnn/experimental/sampler/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tensorflow_gnn/experimental/sampler/beam/BUILD‎
Lines changed: 54 additions & 0 deletions b/‎tensorflow_gnn/experimental/sampler/beam/BUILD‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎tensorflow_gnn/experimental/sampler/beam/sampler.py‎
Lines changed: 274 additions & 0 deletions b/‎tensorflow_gnn/experimental/sampler/beam/sampler.py‎
Lines changed: 274 additions & 0 deletions
@@ -29,6 +29,7 @@
 # Export.
 create_program = eval_dag.create_program
 save_model = eval_dag.save_model
+Artifacts = eval_dag.Artifacts
 
 # Sampling layers.
 InMemUniformEdgesSampler = core.InMemUniformEdgesSampler
 
@@ -127,3 +127,57 @@ pytype_strict_contrib_test(
         "//:expect_tensorflow_installed",
     ],
 )
+
+# Library target wrapping unigraph_utils.py (used by the `sampler` binary and
+# by `unigraph_utils_test` below).
+pytype_strict_library(
+    name = "unigraph_utils",
+    srcs = ["unigraph_utils.py"],
+    srcs_version = "PY3ONLY",
+    deps = [
+        "//third_party/py/apache_beam",
+        "//:expect_numpy_installed",
+        "//:expect_tensorflow_installed",
+        "//tensorflow_gnn",
+        "//tensorflow_gnn/data:unigraph",
+        "//tensorflow_gnn/sampler:sampling_spec_py_proto",
+    ],
+)
+
+# Command-line binary built from sampler.py.
+# NOTE(review): sampler.py imports `accessors` and `edge_samplers` with
+# `unused-import` pragmas, so these deps are presumably needed for import-time
+# side effects only -- confirm before pruning them.
+py_binary(
+    name = "sampler",
+    srcs = ["sampler.py"],
+    deps = [
+        ":accessors",
+        ":edge_samplers",
+        ":executor_lib",
+        ":unigraph_utils",
+        "//third_party/py/absl:app",
+        "//third_party/py/absl/flags",
+        "//third_party/py/absl/logging",
+        "//third_party/py/apache_beam",
+        "//:expect_tensorflow_installed",
+        "//tensorflow_gnn",
+        "//tensorflow_gnn/data:unigraph",
+        "//tensorflow_gnn/experimental/sampler",
+        "//tensorflow_gnn/experimental/sampler:subgraph_pipeline",
+        "//tensorflow_gnn/proto:graph_schema_py_proto",
+        "//tensorflow_gnn/sampler:sampling_spec_py_proto",
+    ],
+)
+
+# Test target for :unigraph_utils; the heterogeneous testdata is made available
+# to the test through `data`.
+pytype_strict_contrib_test(
+    name = "unigraph_utils_test",
+    srcs = ["unigraph_utils_test.py"],
+    data = ["@tensorflow_gnn//testdata/heterogeneous"],
+    python_version = "PY3",
+    srcs_version = "PY3ONLY",
+    deps = [
+        ":unigraph_utils",
+        "//testing/pybase",
+        "//third_party/py/apache_beam",
+        "//:expect_numpy_installed",
+        "//:expect_tensorflow_installed",
+        "//tensorflow_gnn",
+        "//tensorflow_gnn/data:unigraph",
+        "//tensorflow_gnn/utils:test_utils",
+    ],
+)
@@ -0,0 +1,274 @@
+# Copyright 2023 The TensorFlow GNN Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Runs sampling pipeline defined by the GraphSchema and SamplingSpec.
+
+Closely follows V1.
+"""
+
+import os
+from typing import Optional
+
+from absl import app
+from absl import flags
+from absl import logging
+import apache_beam as beam
+from apache_beam.options.pipeline_options import PipelineOptions
+from apache_beam.options.pipeline_options import SetupOptions
+import tensorflow as tf
+import tensorflow_gnn as tfgnn
+from tensorflow_gnn.data import unigraph
+from tensorflow_gnn.experimental import sampler
+from tensorflow_gnn.experimental.sampler import subgraph_pipeline
+from tensorflow_gnn.experimental.sampler.beam import accessors  # pylint: disable=unused-import
+from tensorflow_gnn.experimental.sampler.beam import edge_samplers  # pylint: disable=unused-import
+from tensorflow_gnn.experimental.sampler.beam import executor_lib
+from tensorflow_gnn.experimental.sampler.beam import unigraph_utils
+from tensorflow_gnn.proto import graph_schema_pb2
+from tensorflow_gnn.sampler import sampling_spec_pb2
+
+from google.protobuf import text_format
+
+
+# Runner names offered by the `--runner` flag and recognized by
+# `_create_beam_runner`.
+_DIRECT_RUNNER = 'DirectRunner'
+_DATAFLOW_RUNNER = 'DataflowRunner'
+
+
+def _get_shape(feature: graph_schema_pb2.Feature) -> tf.TensorShape:
+  dim_fn = lambda dim: (None if dim.size == -1 else dim.size)
+  dims = [dim_fn(dim) for dim in feature.shape.dim]
+  return tf.TensorShape(dims)
+
+
+def get_sampling_model(
+    graph_schema: tfgnn.GraphSchema,
+    sampling_spec: sampling_spec_pb2.SamplingSpec,
+) -> tf.keras.Model:
+  """Constructs sampling model from schema and sampling spec.
+
+  Args:
+    graph_schema: Attribute `edge_sets` identifies end-point node set names.
+    sampling_spec: The number of nodes sampled from edge set. The spec defines
+      the structure of the sampled subgraphs, that look like rooted trees,
+      possibly densified adding all pairwise edges between sampled nodes.
+
+  Returns:
+    A Keras model for sampling.
+  """
+
+  def edge_sampler_factory(
+      op: sampling_spec_pb2.SamplingOp,
+  ) -> sampler.UniformEdgesSampler:
+    accessor = sampler.KeyToTfExampleAccessor(
+        sampler.InMemStringKeyToBytesAccessor(
+            keys_to_values={'b': b'b'}),
+        features_spec={
+            '#target': tf.TensorSpec([None], tf.string),
+        },
+    )
+
+    sample_size = op.sample_size
+    edge_target_feature_name = '#target'
+    return sampler.UniformEdgesSampler(
+        outgoing_edges_accessor=accessor,
+        sample_size=sample_size,
+        edge_target_feature_name=edge_target_feature_name,
+        name=f'edges/{op.edge_set_name}'
+    )
+
+  def node_features_accessor_factory(
+      node_set_name: tfgnn.NodeSetName,
+  ) -> sampler.KeyToTfExampleAccessor:
+    node_features = graph_schema.node_sets[node_set_name].features
+    features_spec = {}
+    for name, feature in node_features.items():
+      shape = _get_shape(feature)
+      dtype = tf.dtypes.as_dtype(feature.dtype)
+      features_spec[name] = tf.TensorSpec(shape, dtype)
+    accessor = sampler.KeyToTfExampleAccessor(
+        sampler.InMemStringKeyToBytesAccessor(
+            keys_to_values={'b': b'b'},
+            name=f'nodes/{node_set_name}'),
+        features_spec=features_spec,
+    )
+    return accessor
+
+  return subgraph_pipeline.create_sampling_model_from_spec(
+      graph_schema,
+      sampling_spec,
+      edge_sampler_factory=edge_sampler_factory,
+      node_features_accessor_factory=node_features_accessor_factory,
+  )
+
+
+def _create_beam_runner(
+    runner_name: Optional[str],
+) -> beam.runners.PipelineRunner:
+  """Creates appropriate runner."""
+  if runner_name == _DIRECT_RUNNER:
+    runner = beam.runners.DirectRunner()
+  elif runner_name == _DATAFLOW_RUNNER:
+    runner = beam.runners.DataflowRunner()
+  else:
+    runner = None
+  return runner
+
+
+def save_artifacts(artifacts: sampler.Artifacts, artifacts_path: str) -> None:
+  for layer_id, model in artifacts.models.items():
+    path = os.path.join(artifacts_path, layer_id)
+    tf.io.gfile.makedirs(path)
+    sampler.save_model(model, path)
+
+
+def define_flags():
+  """Creates commandline flags."""
+
+  flags.DEFINE_string(
+      'graph_schema',
+      None,
+      'Path to a text-formatted GraphSchema proto file or directory '
+      'containing one for a graph in Universal Graph Format. This '
+      'defines the input graph to be sampled.',
+  )
+
+  flags.DEFINE_string(
+      'data_path',
+      None,
+      'Path to data files for node and edge sets. Defaults to the directory '
+      'containing graph_schema.',
+  )
+
+  flags.DEFINE_string(
+      'input_seeds',
+      None,
+      'Path to an input file with the seed node ids to restrict sampling over. '
+      'The file can be in any of the supported unigraph table formats, and as '
+      "for node sets, the 'id' column will be used. If the seeds aren't "
+      'specified, the full set of nodes from the graph will be used '
+      '(optional).',
+  )
+
+  flags.DEFINE_string(
+      'sampling_spec',
+      None,
+      'An input file with a text-formatted SamplingSpec proto to use. This is '
+      "a required input and to some extent may mirror some of the schema's "
+      'structure. See `sampling_spec.proto` for details on the configuration.',
+  )
+
+  flags.DEFINE_string(
+      'output_samples',
+      None,
+      'Output file with serialized graph tensor Example protos.',
+  )
+
+  runner_choices = [_DIRECT_RUNNER, _DATAFLOW_RUNNER]
+  runner_choices.append('flume')
+  flags.DEFINE_enum(
+      'runner',
+      None,
+      runner_choices,
+      'The underlying runner; if not specified, use the default runner.',
+  )
+
+  flags.mark_flags_as_required(
+      ['graph_schema', 'sampling_spec', 'output_samples']
+  )
+
+
+def app_main(argv) -> None:
+  """Main sampler entrypoint.
+
+  Args:
+    argv: List of arguments passed by flags parser.
+  """
+  FLAGS = flags.FLAGS  # pylint: disable=invalid-name
+  pipeline_args = argv[1:]
+  graph_schema: tfgnn.GraphSchema = unigraph.read_schema(FLAGS.graph_schema)
+
+  data_path = os.path.dirname(FLAGS.graph_schema)
+  with tf.io.gfile.GFile(FLAGS.sampling_spec, 'r') as f:
+    sampling_spec = text_format.Parse(
+        f.read(), sampling_spec_pb2.SamplingSpec()
+    )
+  # we have graph schema which defines Graph...
+  # and sampling spec which defines how to sample in V1 format.
+  # 1. Let's define sampling model as TF keras model.
+  # Example:
+  #  model = get_sampling_model(mag_graph_schema, mag_sampling_spec)
+  #  model(tf.ragged.constant([[0], [1]]))
+  #  # returns GraphTensor for seed papers 0 and 1.
+
+  model = get_sampling_model(graph_schema, sampling_spec)
+  # Export sampling model as a "sampling program".
+  program_pb, artifacts = sampler.create_program(model)
+  # here `eval_dag` defines Beam stages to run, artifacts are TF models
+  # for some Beam stages.
+
+  if not FLAGS.data_path:
+    data_path = os.path.dirname(FLAGS.graph_schema)
+  else:
+    data_path = FLAGS.data_path
+
+  output_dir = os.path.dirname(FLAGS.output_samples)
+  artifacts_path = os.path.join(output_dir, 'artifacts')
+  if tf.io.gfile.exists(artifacts_path):
+    raise ValueError(f'{artifacts_path} already exists.')
+
+  tf.io.gfile.makedirs(artifacts_path)
+  save_artifacts(artifacts, artifacts_path)
+
+  pipeline_options = PipelineOptions(pipeline_args)
+  pipeline_options.view_as(SetupOptions).save_main_session = True
+
+  with beam.Pipeline(
+      runner=_create_beam_runner(FLAGS.runner), options=pipeline_options
+  ) as root:
+    feeds = (root
+             | unigraph_utils.ReadAndConvertUnigraph(graph_schema, data_path))
+    if FLAGS.input_seeds:
+      seeds = unigraph_utils.read_seeds(root, FLAGS.input_seeds)
+    else:
+      seeds = unigraph_utils.seeds_from_graph_dict(feeds)
+    inputs = {
+        'Input': seeds,
+    }
+    examples = executor_lib.execute(
+        program_pb,
+        inputs,
+        feeds=feeds,
+        artifacts_path=artifacts_path
+    )
+    # results are tuple: example_id to tf.Example with graph tensors.
+    coder = beam.coders.ProtoCoder(tf.train.Example)
+    _ = (
+        examples
+        | 'DropExampleId' >> beam.Values()
+        | 'WriteToTFRecord'
+        >> beam.io.WriteToTFRecord(
+            os.path.join(output_dir, 'examples.tfrecord'), coder=coder
+        )
+    )
+    logging.info('Pipeline complete')
+
+
+def main():
+  define_flags()
+  app.run(
+      app_main, flags_parser=lambda argv: flags.FLAGS(argv, known_only=True)
+  )
+
+if __name__ == '__main__':
+  main()