I use GitHub a lot for schoolwork and personal projects. It is a wonderful platform for deploying, publishing, and collaborating on coding projects. Because of how nifty it is, I am excited to explore this graph of GitHub project membership. I am not sure how the data was pre-processed. Looking at the interactive network data visualization and analytics platform, it appears that all the nodes are connected. The graph has only 121,709 nodes compared to GitHub's over 100 million total projects, so it is likely the largest connected component, or a subgraph of the largest connected component. It was taking far too long to process, so I ended up taking a random sample of 40,000 nodes and extracting the largest connected component, which ended up having 19,498 nodes. Even this subgraph required about seven hours to run the whole notebook.
This graph is bipartite, with one set of nodes representing members and the other representing projects. I expect member nodes with high centrality values to be senior engineers at larger companies or significant figures in the field, and project nodes with high centrality values to be heavily used or integral projects. I do not think the degree distribution will exhibit a power law, because highly involved developers only have so much time, and projects can only have so many people working on them before it becomes very hard to keep everyone up to date and efficient. I am not sure whether it will exhibit the small-world property. This sampled graph might, because it is much smaller than all of GitHub and likely contains the most important projects and people, but the total GitHub network probably would not.
import networkx as nx
import numpy as np
def num_vertices(edge_list):
    counter_set = set()
    for edge in edge_list:
        counter_set.add(edge[0])
        counter_set.add(edge[1])
    return len(counter_set)
def vertex_degree(edge_list, vertex_index):
    degree_count = 0
    for edge in edge_list:
        if vertex_index in edge:
            degree_count += 1
    return degree_count
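As a quick sanity check (my own addition, using a made-up toy edge list rather than the GitHub data), the two helpers above can be verified by hand: a triangle on vertices 1, 2, 3 plus a pendant vertex 4 has four vertices, and vertex 3 should have degree 3.
toy_edges = [(1, 2), (1, 3), (2, 3), (3, 4)]  # hypothetical toy graph: triangle 1-2-3 plus pendant vertex 4
print(num_vertices(toy_edges))      # expected: 4
print(vertex_degree(toy_edges, 3))  # vertex 3 touches (1,3), (2,3), (3,4) -> expected: 3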
def clustering_coef(edge_list, vertex_index):
    # Build an adjacency list from the edge list.
    edge_dict = {}
    for edge in edge_list:
        if edge[0] not in edge_dict:
            edge_dict[edge[0]] = []
        if edge[1] not in edge_dict:
            edge_dict[edge[1]] = []
        edge_dict[edge[0]] += [edge[1]]
        edge_dict[edge[1]] += [edge[0]]
    connected_nodes = edge_dict[vertex_index]
    if len(connected_nodes) < 2:
        # Fewer than two neighbors means no possible neighbor pairs; treat as 0.
        return 0
    # Count edges between pairs of neighbors (each such edge gets counted twice).
    num_connections_bet_nodes = 0
    for connection in connected_nodes:
        for node in edge_dict[connection]:
            if node != vertex_index and node in connected_nodes:
                num_connections_bet_nodes += 1
    num_connections_bet_nodes /= 2
    return num_connections_bet_nodes / (
        (len(connected_nodes) * (len(connected_nodes) - 1)) / 2
    )
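Same kind of sanity check on the toy edge list from above (again my addition): vertex 3's neighbors are 1, 2, and 4, and only the pair (1, 2) is itself an edge, so its clustering coefficient should come out to 1/3 and agree with NetworkX.
# One of vertex 3's three possible neighbor pairs is connected, so expect 1/3.
print(clustering_coef(toy_edges, 3))
print(nx.clustering(nx.Graph(toy_edges), 3))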
def betweennes_centrality(edge_list, vertex_index):
    graph = nx.Graph()
    for edge in edge_list:
        graph.add_edge(edge[0], edge[1])
    node_list = list(graph.nodes)
    sum_bet_cent = 0
    # For every pair of other nodes, add the fraction of shortest paths
    # between them that pass through vertex_index.
    for node in node_list[:-1]:
        for node2 in node_list[node_list.index(node) + 1 :]:
            if node == vertex_index or node2 == vertex_index:
                continue
            counter = 0
            total_counter = 0
            for path in nx.all_shortest_paths(graph, node, node2):
                if vertex_index in path:
                    counter += 1
                total_counter += 1
            sum_bet_cent += counter / total_counter
    return sum_bet_cent
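This function sums, over every pair of other nodes, the fraction of shortest paths between them that run through the chosen vertex, so it should agree with NetworkX's unnormalized betweenness (endpoints excluded) on a connected graph. A small check I added on another made-up graph:
check_edges = [(1, 2), (2, 3), (3, 4), (2, 4)]  # hypothetical toy graph
print(betweennes_centrality(check_edges, 2))
print(nx.betweenness_centrality(nx.Graph(check_edges), normalized=False)[2])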
def average_shortest_path_length(edge_list):
    graph = nx.Graph()
    for edge in edge_list:
        graph.add_edge(edge[0], edge[1])
    node_list = list(graph.nodes)
    counter = 0
    sum_length = 0
    for i in range(len(node_list) - 1):
        for j in range(i + 1, len(node_list)):
            if nx.has_path(graph, node_list[i], node_list[j]):
                sum_length += nx.shortest_path_length(graph, node_list[i], node_list[j])
                counter += 1
    if counter == 0:
        return 0
    return sum_length / counter
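On a connected graph this should match nx.average_shortest_path_length exactly; the NetworkX version raises an error on disconnected graphs, while the version above simply skips unreachable pairs. A quick check I added, reusing the toy graph from the betweenness check:
# Both calls should print the same value for the connected toy graph.
print(average_shortest_path_length(check_edges))
print(nx.average_shortest_path_length(nx.Graph(check_edges)))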
def adjacency_matrix(edge_list):
    # Assumes vertices are labeled 1..n with no gaps, since labels are used as matrix indices.
    num_vert = num_vertices(edge_list)
    mat = np.zeros((num_vert, num_vert))
    for edge in edge_list:
        mat[edge[0] - 1][edge[1] - 1] = 1
        mat[edge[1] - 1][edge[0] - 1] = 1
    return mat
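One caveat worth flagging (my comment, not from the original analysis): this function assumes vertices are labeled 1 through n with no gaps, because the labels are used directly as matrix indices. With arbitrary labels, like the node ids in the sampled GitHub subgraph, the labels would need to be remapped to 0..n-1 first. Under that assumption it behaves as expected on a toy triangle:
# Only valid when labels are exactly 1..n; otherwise remap labels to indices first.
print(adjacency_matrix([(1, 2), (2, 3), (3, 1)]))
# [[0. 1. 1.]
#  [1. 0. 1.]
#  [1. 1. 0.]]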
def ec_eigen_centrality(adj_mat):
    # Power iteration; the adjacency matrix is symmetric, so no transpose is needed.
    eigen = np.ones(len(adj_mat))
    last_eigen = np.zeros(len(adj_mat))
    max_dif = 0.0001
    last_dif = np.inf
    # Iterate until successive estimates stop changing (sum of absolute differences).
    while (this_dif := np.abs(last_eigen - eigen).sum()) > max_dif:
        if this_dif == last_dif:
            # No further progress (e.g. oscillation); stop to avoid looping forever.
            break
        last_dif = this_dif
        last_eigen = eigen.copy()
        eigen = np.matmul(adj_mat, eigen)
        # Rescale so the largest entry is 1, which keeps the values from overflowing.
        eigen = eigen / max(eigen)
    return eigen
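Because the power iteration above rescales by the largest entry rather than by the Euclidean norm, its values will not match nx.eigenvector_centrality exactly, but the ranking of nodes should. A small comparison I added on the toy graph:
# Compare rankings only; the two results use different normalizations.
check_edges = [(1, 2), (2, 3), (3, 4), (2, 4)]
ec_custom = ec_eigen_centrality(adjacency_matrix(check_edges))
ec_nx = nx.eigenvector_centrality(nx.Graph(check_edges))
print(sorted(range(1, 5), key=lambda v: ec_custom[v - 1], reverse=True))
print(sorted(ec_nx, key=ec_nx.get, reverse=True))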
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import random
gh_graph_org = nx.read_edgelist("rec-github.edges", create_using = nx.Graph(), nodetype=int)
random_nodes_from_gh_graph = random.sample(list(gh_graph_org.nodes), 40000)  # random.sample needs a sequence, not a node view
random_subgraph_from_gh = gh_graph_org.subgraph(random_nodes_from_gh_graph)
largest_cc_nodes = max(nx.connected_components(random_subgraph_from_gh), key=len)
gh_graph = gh_graph_org.subgraph(largest_cc_nodes)
print(len(gh_graph))
19498
plt.figure(figsize=(10,10))
nx.draw_spring(gh_graph, with_labels=False, node_size=[gh_graph.degree(v) for v in gh_graph])
node_degrees = [(gh_graph.degree(v), v) for v in gh_graph]
sorted_degrees = sorted(node_degrees, key=lambda x: x[0], reverse=True)
for i in range(10):
    print(f"{i}. Node {sorted_degrees[i][1]} has degree {sorted_degrees[i][0]}")
0. Node 654 has degree 532
1. Node 301 has degree 498
2. Node 616 has degree 468
3. Node 8 has degree 395
4. Node 866 has degree 376
5. Node 81 has degree 341
6. Node 650 has degree 315
7. Node 518 has degree 305
8. Node 212 has degree 294
9. Node 564 has degree 293
bet_cent = nx.betweenness_centrality(gh_graph, endpoints=True)
sorted_bet_cent = sorted(bet_cent.items(), key=lambda x: x[1], reverse=True)
for i in range(10):
    print(f"{i}. Node {sorted_bet_cent[i][0]} has betweenness centrality {sorted_bet_cent[i][1]}")
0. Node 301 has betweenness centrality 0.06329461106590673
1. Node 654 has betweenness centrality 0.058230759235956636
2. Node 616 has betweenness centrality 0.04720055753877458
3. Node 8 has betweenness centrality 0.044279421324421045
4. Node 81 has betweenness centrality 0.039412193220971364
5. Node 518 has betweenness centrality 0.03428559661706492
6. Node 212 has betweenness centrality 0.03314051968697511
7. Node 866 has betweenness centrality 0.03158174218865254
8. Node 94 has betweenness centrality 0.03131432324128569
9. Node 564 has betweenness centrality 0.029717606097352835
The ten nodes shown below are simply the first ten in the list sorted by clustering coefficient; many nodes are tied at 1.0, so which ten appear is essentially arbitrary.
clustering_coeffs = nx.clustering(gh_graph)
sorted_coeffs = sorted(clustering_coeffs.items(), reverse=True, key=lambda x: x[1])
for i in range(10):
    print(f"{i}. Node {sorted_coeffs[i][0]} has clustering coefficient {sorted_coeffs[i][1]}")
0. Node 33636 has clustering coefficient 1.0
1. Node 1004 has clustering coefficient 1.0
2. Node 66560 has clustering coefficient 1.0
3. Node 37013 has clustering coefficient 1.0
4. Node 37424 has clustering coefficient 1.0
5. Node 37884 has clustering coefficient 1.0
6. Node 5359 has clustering coefficient 1.0
7. Node 71534 has clustering coefficient 1.0
8. Node 39100 has clustering coefficient 1.0
9. Node 40090 has clustering coefficient 1.0
eigen_centrality = nx.eigenvector_centrality(gh_graph, max_iter=400)
sorted_eigens = sorted(eigen_centrality.items(), reverse=True, key=lambda x: x[1])
for i in range(10):
    print(f"{i}. Node {sorted_eigens[i][0]} has eigenvector centrality {sorted_eigens[i][1]}")
0. Node 654 has eigenvector centrality 0.21237810554763278
1. Node 616 has eigenvector centrality 0.18851476693646493
2. Node 301 has eigenvector centrality 0.17947860521926398
3. Node 8 has eigenvector centrality 0.15324041452261145
4. Node 518 has eigenvector centrality 0.13662177016604668
5. Node 542 has eigenvector centrality 0.1319877969418028
6. Node 94 has eigenvector centrality 0.13086384933196013
7. Node 564 has eigenvector centrality 0.12867042242313892
8. Node 16 has eigenvector centrality 0.11727428233171622
9. Node 650 has eigenvector centrality 0.10842289238724852
pagerank = nx.pagerank(gh_graph, max_iter=400)
sorted_pagerank = sorted(pagerank.items(), reverse=True, key=lambda x: x[1])
for i in range(10):
    print(f"{i}. Node {sorted_pagerank[i][0]} has pagerank {sorted_pagerank[i][1]}")
0. Node 301 has pagerank 0.003943207488007591
1. Node 654 has pagerank 0.0037798114051954194
2. Node 866 has pagerank 0.00348094307728918
3. Node 616 has pagerank 0.0033272334399155157
4. Node 81 has pagerank 0.0030444101878482404
5. Node 8 has pagerank 0.0029345473539133683
6. Node 650 has pagerank 0.002331479163558558
7. Node 212 has pagerank 0.002330785388422002
8. Node 518 has pagerank 0.0021947006020938066
9. Node 83 has pagerank 0.0021913540186490857
The measures above tend to surface many of the same nodes. Clustering coefficient is the only one whose top ten looks completely different, and those nodes are all tied at a coefficient of 1; the top nodes from the other measures would probably also have a clustering coefficient of 1, but there are simply too many ties for them to show up in the first ten. I think this shows that the people and projects that are most central by one measure tend to be central by all of them. Nodes 301, 654, 866, and 8 seem to be especially important. It would be interesting to see which projects or people these nodes represent.
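To make the overlap a bit more concrete, the top-ten sets can be intersected directly from the dictionaries already computed above (a small addition of mine):
# Count how many of the top-10 nodes each pair of measures has in common.
def top10(score_dict):
    return {node for node, _ in sorted(score_dict.items(), key=lambda x: x[1], reverse=True)[:10]}

top_sets = {
    "degree": {node for _, node in sorted_degrees[:10]},
    "betweenness": top10(bet_cent),
    "eigenvector": top10(eigen_centrality),
    "pagerank": top10(pagerank),
}
for name_a in top_sets:
    for name_b in top_sets:
        if name_a < name_b:
            print(f"{name_a} vs {name_b}: {len(top_sets[name_a] & top_sets[name_b])}/10 in common")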
import math
ave_shortest_path = nx.average_shortest_path_length(gh_graph)
print(f"Average shortest path length is {ave_shortest_path}")
print(f"Natural log of number of nodes is {math.log(len(gh_graph))}")
print(f"Log base 10 of number of nodes is {math.log(len(gh_graph), 10)}")
print(f"Log base 2 of number of nodes is {math.log(len(gh_graph), 2)}")
# print(f"The small-world coefficient omega is {nx.omega(gh_graph)}")
Average shortest path length is 4.895764638205489
Natural log of number of nodes is 9.878067175189218
Log base 10 of number of nodes is 4.289990066054319
Log base 2 of number of nodes is 14.251038527213536
This graph seems to exhibit small-world behavior: the average shortest path length (about 4.9) is on the same order as the logarithm of the number of nodes (log base 10 is about 4.3). I tried to compute the omega small-world coefficient, but it ran for over 16 hours without finishing.
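As a rough benchmark I added afterwards (not a substitute for omega): a random graph with the same number of nodes and mean degree has an expected average shortest path length of roughly ln(n) / ln(mean degree), so the observed value can be compared against that.
# Rough random-graph benchmark for the average shortest path length.
n = gh_graph.number_of_nodes()
mean_degree = 2 * gh_graph.number_of_edges() / n
print(f"Random-graph benchmark ln(n)/ln(<k>): {math.log(n) / math.log(mean_degree):.2f}")
print(f"Observed average shortest path length: {ave_shortest_path:.2f}")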
dict_degrees = dict(nx.degree(gh_graph))
degree_vals = dict_degrees.values()
unique, count = np.unique(list(degree_vals), return_counts=True)
plt.scatter(unique, count)
plt.xscale('log', base=2)  # base= replaces the older basex=/basey= keywords
plt.yscale('log', base=2)
plt.xlabel('log(degree)')
plt.ylabel('log(frequency)')
This graph shows power law behavior pretty clearly. What a nice line!
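To put a rough number on that line (my addition; a proper fit would use maximum likelihood, for example via the powerlaw package), the slope of the log-log points computed above can be estimated with ordinary least squares:
# Crude power-law exponent estimate: slope of a least-squares line in log-log space.
log_deg = np.log(unique.astype(float))
log_freq = np.log(count.astype(float))
slope, _ = np.polyfit(log_deg, log_freq, 1)
print(f"Estimated power-law exponent (negative slope): {-slope:.2f}")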
dict_degrees = dict(nx.degree(gh_graph))
degree_vals = dict_degrees.values()
unique, count = np.unique(list(degree_vals), return_counts=True)
# Average the clustering coefficient over all nodes of each degree.
clust_degree_dict = {}
for node, degree in dict_degrees.items():
    if clust_degree_dict.get(degree) is None:
        clust_degree_dict[degree] = [0, 0]
    clust_degree_dict[degree][0] += clustering_coeffs[node]
    clust_degree_dict[degree][1] += 1
final_dict = {}
for degree, clust in clust_degree_dict.items():
    # clust[0] is the summed clustering coefficient, clust[1] the node count.
    final_dict[degree] = clust[0] / clust[1]
plt.scatter(final_dict.keys(), final_dict.values())
plt.xscale('log', base=2)
plt.yscale('log', base=2)
plt.xlabel('log(degree)')
plt.ylabel('log(average clustering coefficient)')
According to this plot, the average clustering coefficient at each degree is either 1 or 0 for this graph. I'm not sure why that is. One guess: if the graph were really loaded as a two-mode member/project graph it would contain no triangles at all, so every clustering coefficient would be 0; the nodes with coefficient 1 suggest the member and project ids may overlap in the raw integer edge list and got merged on load. The quick check below probes that.
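One quick way to test that guess (my follow-up): a strictly bipartite member/project graph has no odd cycles, hence no triangles, so checking bipartiteness directly on both the sampled subgraph and the full loaded graph shows whether the two node sets were actually kept separate when the edge list was read in.
# A bipartite graph contains no triangles, so clustering coefficients of 1 are only
# possible if these checks print False (i.e. the two node sets were merged on load).
print(nx.is_bipartite(gh_graph))
print(nx.is_bipartite(gh_graph_org))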