# 连接是否带权重 if args.weighted: G = nx.read_edgelist(args.input, nodetype=int, data=(('weight', float),), create_using=nx.DiGraph()) else: G = nx.read_edgelist(args.input, nodetype=int, create_using=nx.DiGraph()) for edge in G.edges(): G[edge[0]][edge[1]]['weight'] = np.abs(np.random.randn())
defalias_setup(probs): ''' Compute utility lists for non-uniform sampling from discrete distributions. Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ for details ''' # 事件个数 K = len(probs) # q 关联 probs q = np.zeros(K) # J 别名 Alias J = np.zeros(K, dtype=np.int) # 两组 smaller = [] larger = [] # 将各个概率分成两组,一组的概率值大于 1,另一组小于 1 for kk, prob inenumerate(probs): q[kk] = K*prob if q[kk] < 1.0: smaller.append(kk) else: larger.append(kk)
# 贪心算法:概率小于 1 的不断填满 whilelen(smaller) > 0andlen(larger) > 0: small = smaller.pop() large = larger.pop()
J[small] = large q[large] = q[large] + q[small] - 1.0 if q[large] < 1.0: smaller.append(large) else: larger.append(large)
return J, q
1 2 3 4 5 6 7 8 9 10 11
defalias_draw(J, q): ''' Draw sample from a non-uniform discrete distribution using alias sampling. ''' K = len(J) kk = int(np.floor(np.random.rand()*K)) if np.random.rand() < q[kk]: return kk # 取自己本来就对应的事件 else: return J[kk] # 取 Alias事件
defget_alias_edge(src, dst): ''' Get the alias edge setup lists for a given edge. ''' p = args.p q = args.q unnormalized_probs = [] # 论文 3.2.2 节核心算法,计算各条边的转移权重 for dst_nbr insorted(G.neighbors(dst)): if dst_nbr == src: unnormalized_probs.append(G[dst][dst_nbr]['weight']/p) elif G.has_edge(dst_nbr, src): unnormalized_probs.append(G[dst][dst_nbr]['weight']) else: unnormalized_probs.append(G[dst][dst_nbr]['weight']/q) # 归一化各条边的转移权重 norm_const = sum(unnormalized_probs) normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]
# 执行 Alias Sampling return alias_setup(normalized_probs)
测试
1
get_alias_edge(15,16)
(array([0, 0]), array([1. , 0.99357163]))
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
alias_nodes = {}
# 节点概率 alias sampling 和归一化 for node in G.nodes(): unnormalized_probs = [G[node][nbr]['weight'] for nbr insorted(G.neighbors(node))] norm_const = sum(unnormalized_probs) normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs] alias_nodes[node] = alias_setup(normalized_probs) # 信息展示 if node == 25: print('25号结点') print(unnormalized_probs) print(norm_const) print(normalized_probs) print(alias_nodes[node])
# 边概率 alias sampling 和归一化 if is_directed: for edge in G.edges(): alias_edges[edge] = get_alias_edge(edge[0], edge[1]) else: for edge in G.edges(): alias_edges[edge] = get_alias_edge(edge[0], edge[1]) alias_edges[(edge[1], edge[0])] = get_alias_edge(edge[1], edge[0])
defnode2vec_walk(self, walk_length, start_node): ''' Simulate a random walk starting from start node. ''' G = self.G alias_nodes = self.alias_nodes alias_edges = self.alias_edges
defsimulate_walks(self, num_walks, walk_length): ''' Repeatedly simulate random walks from each node. ''' G = self.G walks = [] nodes = list(G.nodes()) print('Walk iteration:') for walk_iter inrange(num_walks): print(str(walk_iter+1), '/', str(num_walks)) random.shuffle(nodes) for node in nodes: walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))
return walks
defget_alias_edge(self, src, dst): ''' Get the alias edge setup lists for a given edge. ''' G = self.G p = self.p q = self.q
unnormalized_probs = [] for dst_nbr insorted(G.neighbors(dst)): if dst_nbr == src: unnormalized_probs.append(G[dst][dst_nbr]['weight']/p) elif G.has_edge(dst_nbr, src): unnormalized_probs.append(G[dst][dst_nbr]['weight']) else: unnormalized_probs.append(G[dst][dst_nbr]['weight']/q) norm_const = sum(unnormalized_probs) normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs]
return alias_setup(normalized_probs)
defpreprocess_transition_probs(self): ''' Preprocessing of transition probabilities for guiding the random walks. ''' G = self.G is_directed = self.is_directed
alias_nodes = {} for node in G.nodes(): unnormalized_probs = [G[node][nbr]['weight'] for nbr insorted(G.neighbors(node))] norm_const = sum(unnormalized_probs) normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs] alias_nodes[node] = alias_setup(normalized_probs)
alias_edges = {} triads = {}
if is_directed: for edge in G.edges(): alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) else: for edge in G.edges(): alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])
''' Reference implementation of node2vec. Author: Aditya Grover For more details, refer to the paper: node2vec: Scalable Feature Learning for Networks Aditya Grover and Jure Leskovec Knowledge Discovery and Data Mining (KDD), 2016 '''
import argparse import numpy as np import networkx as nx import node2vec from gensim.models import Word2Vec
parser.add_argument('--directed', dest='directed', action='store_true', help='Graph is (un)directed. Default is undirected.') parser.add_argument('--undirected', dest='undirected', action='store_false') parser.set_defaults(directed=False)
return parser.parse_args(args=[])
defread_graph(): ''' Reads the input network in networkx. ''' if args.weighted: G = nx.read_edgelist(args.input, nodetype=int, data=(('weight',float),), create_using=nx.DiGraph()) else: G = nx.read_edgelist(args.input, nodetype=int, create_using=nx.DiGraph()) for edge in G.edges(): G[edge[0]][edge[1]]['weight'] = 1
ifnot args.directed: G = G.to_undirected()
return G
deflearn_embeddings(walks): ''' Learn embeddings by optimizing the Skipgram objective using SGD. ''' walks = [map(str, walk) for walk in walks] model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers, iter=args.iter) model.save_word2vec_format(args.output)
return
defmain(args): ''' Pipeline for representational learning for all nodes in a graph. ''' nx_G = read_graph() G = node2vec.Graph(nx_G, args.directed, args.p, args.q) G.preprocess_transition_probs() walks = G.simulate_walks(args.num_walks, args.walk_length) learn_embeddings(walks)
if __name__ == "__main__": args = parse_args() main(args)