Revise misplaced columns in dynotears estimation, and fix a minor bug in dataset generation. by ThinkNaive · Pull Request #119 · mckinsey/causalnex

ThinkNaive · 2021-06-19T10:57:43Z

No description provided.

… in dataset generation.

.gitignore

mzjp2 · 2021-06-21T09:08:48Z

causalnex/structure/data_generators/wrappers.py

        )
-    intra_nodes = sorted(el for el in g.nodes if "_lag0" in el)
-    inter_nodes = sorted(el for el in g.nodes if "_lag0" not in el)
+    intra_nodes = sorted([el for el in g.nodes if "_lag0" in el], key=lambda t: t.split('_lag')[1])


sorted accepts a generator expression, doesn't need to (waste memory!) coercing it into a list first :)

sorted(el for el in g.nodes if "_lag0" not in el) will lead to misplaced columns, e.g. [0_lag1, 0_lag2, 0_lag3, 1_lag1, 1_lag2, 1_lag3, 2_lag1, 2_lag2, 2_lag3] (node first, then lag, case1). But the dynotears model accepts columns like [0_lag1, 1_lag1, 2_lag1, 0_lag2, 1_lag2, 2_lag2, 0_lag3, 1_lag3, 2_lag3] (lag first, then node, case2). The default sorted expression takes a case1 style. I've test the revised code using the code below.

__test__.py

# __test__.py from causalnex.structure.transformers import DynamicDataTransformer import os import random from matplotlib import ticker import numpy as np import networkx as nx import matplotlib.pyplot as plt from causalnex.structure.data_generators import gen_stationary_dyn_net_and_df from netgraph import InteractiveGraph from causalnex.structure.dynotears import from_pandas_dynamic import scipy.linalg as slin # metrics def count_accuracy(B_true, B_est) -> tuple: B_true = B_true != 0 B_est = B_est != 0 d = min(B_est.shape) # fdr fp = np.sum(~B_true & B_est) pp = np.sum(B_est) fdr = fp / pp # tpr tp = np.sum(B_true & B_est) tt = np.sum(B_true) tpr = tp / tt # fpr tf = d * (d - 1) / 2 - np.sum(B_true) + \ B_est.shape[0] * B_est.shape[1] - d * d fpr = fp / tf # shd shd = np.sum(B_true != B_est) # nnz nnz = pp return fdr, tpr, fpr, shd, nnz # reproduce def set_seed(seed): """ Referred from: - https://stackoverflow.com/questions/38469632/tensorflow-non-repeatable-results """ # Reproducibility random.seed(seed) np.random.seed(seed) try: os.environ["PYTHONHASHSEED"] = str(seed) except: pass # plot weighted directed graph def plot_graph(g, node_size=8, font_size=8): # define layout pos = {} for node in g.nodes(): # get node vertex index and slice index index = node.split('_lag') vertex = int(index[0]) + 1 slice = int(index[1]) # generate position with each slice as round shape x = np.random.uniform(low=-0.0, high=0.0) + 0.3 * \ np.cos(2 * np.pi * vertex / d) + 1.0 * (p - slice) y = np.random.uniform(low=-0.1, high=0.1) + \ np.sin(2 * np.pi * vertex / d) pos[node] = (x, y) g = nx.DiGraph(g) fig, ax = plt.subplots() plot_instance = InteractiveGraph( g, node_size=node_size, node_labels=True, node_label_fontdict=dict(size=font_size), node_layout=pos, arrows=True, ax=ax ) # transform graph to matrix def to_matrix(g): a = nx.to_numpy_array(g) # a /= np.abs(a).max() # permute array by order from max lag to instant. sorted_nodes = sorted(g.nodes(), key=lambda k: d * int(k[-1]) + int(k[0])) sorted_index = [list(g.nodes()).index(i) for i in sorted_nodes] p = np.zeros(a.shape) for y, x in enumerate(sorted_index): p[y, x] = 1.0 a = p.dot(a).dot(p.T) return a[:, :d] # plot matrix def plot_matrix(gt, ge=None, names=None, rng=2.0): # transform graph to matrix and compute layout bt = to_matrix(gt) d = bt.shape[1] n_col = bt.shape[0] // d if ge: be = to_matrix(ge) n_row = 2 b = [bt, be] else: n_row = 1 b = [bt] # plot matrix fig, ax = plt.subplots(n_row, n_col, figsize=(16, 6)) ax = ax.flatten() for row in range(n_row): for col in range(n_col): # split matrix for intra and inter-p mat = b[row][d * col: d * (col + 1), :] # plot matrix im = ax[row * n_col + col].imshow(mat, cmap="seismic", interpolation="none", vmin=-rng, vmax=rng) # add value labels for i in range(d): for j in range(d): color = 'white' if abs(mat[i, j]) > 0.3 * rng else 'black' value = '{:g}'.format(round(mat[i, j], 3)) ax[row * n_col + col].text(j, i, value, ha="center", va="center", color=color, fontsize=9) # hide axis ax[row * n_col + col].set_xticks([]) ax[row * n_col + col].set_yticks([]) # show graph type labels if col == 0: ax[row * n_col + col].set_ylabel(names[row], fontsize=10) # show instant and lagged labels if row == 0: if col == 0: label = '$W$ (Intra-slice)' else: label = '$A_{%d}$ (Inter-slice)' % col ax[row * n_col + col].set_xlabel(label, fontsize=10) ax[row * n_col + col].xaxis.set_label_position('top') # add colorbar cb = fig.colorbar(im, ax=ax, shrink=0.7) tick_locator = ticker.MaxNLocator(nbins=5) cb.locator = tick_locator cb.set_ticks([t - rng for t in range(int(rng) * 2 + 1)]) cb.update_ticks() return fig # compute loss def compute_loss(dataset, g, p): X, Xlags = DynamicDataTransformer( p=p).fit_transform(dataset, return_df=False) n, d_vars = X.shape wa = to_matrix(g) w_mat, a_mat = wa[: d_vars, :], wa[d_vars:, :] loss = ( 0.5 / n * np.square( np.linalg.norm( X.dot(np.eye(d_vars, d_vars) - w_mat) - Xlags.dot(a_mat), "fro" ) ) ) h = np.trace(slin.expm(w_mat * w_mat)) - d_vars return loss, h # main set_seed(0) d = 5 p = 3 # generate dataset g, df, intra, inter = gen_stationary_dyn_net_and_df( n_samples=500, num_nodes=5, p=3, w_min_intra=0.5, w_max_intra=2.0, w_min_inter=0.3, w_max_inter=0.5, degree_intra=4, degree_inter=1, w_decay=1.3 ) # convert dataset for learning dataset = df.iloc[:, :d] dataset.columns = [t.split('_lag')[0] for t in dataset.columns] # estimate graph gg = from_pandas_dynamic( time_series=[dataset], p=3, tau_w=0.01, tau_a=0.01, lambda_w=0.05, lambda_a=0.05) # compute metrics fdr, tpr, fpr, shd, nnz = count_accuracy( to_matrix(g), to_matrix(gg)) print("FDR:{:.2%} TPR:{:.2%} FPR:{:.2%} SHD:{:2d} NNZ:{:2d}".format( fdr, tpr, fpr, shd, nnz )) # compute loss for both true graph and estimated graph print('TrueGraph: loss={} h={}'.format(*compute_loss(dataset, g, p))) print('Estimated: loss={} h={}'.format(*compute_loss(dataset, gg, p))) # plot graph and matrix # plot_graph(g) # plot_graph(gg) fig = plot_matrix(g, gg, names=('True', 'Estimated')) # save image imgfile = "graph.svg" fig.savefig(imgfile, transparent=True) plt.show(block=True)

oentaryorj · 2022-09-02T03:15:48Z

Hi @ThinkNaive, thanks for the PR. It seems that the unit tests are failing now. Would you be able to help address them?

Revise misplaced columns in dynotears estimation, and fix a minor bug…

1ed7976

… in dataset generation.

mzjp2 reviewed Jun 21, 2021

View reviewed changes

.gitignore Outdated Show resolved Hide resolved

mzjp2 reviewed Jun 21, 2021

View reviewed changes

oentaryorj added the bug Something isn't working label Sep 8, 2021

oentaryorj added 8 commits November 3, 2021 21:39

Merge branch 'develop' into develop

b9b5e4d

Replace w_threshold with tau_w

1c56978

Update test_dynotears.py

eb61fff

Update .gitignore

32372c1

Merge branch 'develop' into develop

e734905

Merge branch 'develop' into develop

d128d82

Merge branch 'develop' into develop

0e39493

Merge branch 'develop' into develop

36727b5

oentaryorj assigned ThinkNaive Sep 12, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Revise misplaced columns in dynotears estimation, and fix a minor bug in dataset generation.#119

Revise misplaced columns in dynotears estimation, and fix a minor bug in dataset generation.#119
ThinkNaive wants to merge 9 commits intomckinsey:developfrom
ThinkNaive:develop

ThinkNaive commented Jun 19, 2021

Uh oh!

Uh oh!

mzjp2 Jun 21, 2021

Uh oh!

ThinkNaive Jun 21, 2021

Uh oh!

oentaryorj commented Sep 2, 2022

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

Conversation

ThinkNaive commented Jun 19, 2021

Uh oh!

Uh oh!

mzjp2 Jun 21, 2021

Choose a reason for hiding this comment

Uh oh!

ThinkNaive Jun 21, 2021

Choose a reason for hiding this comment

Uh oh!

oentaryorj commented Sep 2, 2022

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants