I use GitHub a lot for schoolwork and personal projects. It is a wonderful platform for deploying, publishing, and collaborating on coding projects. Because of how nifty it is, I am excited to explore this graph of GitHub project membership. I am not sure how the data was pre-processed. Looking at the interactive network data visualization and analytics platform, it appears that all the nodes are connected. The graph has only 121,709 nodes compared to GitHub's over 100 million total projects, so it is likely the largest connected component, or a subgraph of the largest connected component. It was taking far too long to process, so I ended up taking a random sample of 40,000 nodes and extracting the largest connected component, which ended up having 19,498 nodes. Even this subgraph required about seven hours to run the whole notebook.
This graph is bipartite, with one set of nodes representing members and the other representing projects. I expect member nodes with high centrality values to be senior engineers at larger companies or significant figures in the field, and project nodes with high centrality values to be heavily used or integral projects. I do not think the degree distribution will exhibit a power law, because highly involved developers only have so much time, and projects can only have so many people working on them before it becomes very hard to keep everyone up to date and efficient. I am not sure whether it will exhibit the small-world property. This sampled graph might, because it is much smaller than all of GitHub and likely contains the most important projects and people, but the total GitHub network probably would not.
import networkx as nx
import numpy as np
def num_vertices(edge_list):
    counter_set = set()
    for edge in edge_list:
        counter_set.add(edge[0])
        counter_set.add(edge[1])
    return len(counter_set)
def vertex_degree(edge_list, vertex_index):
    degree_count = 0
    for edge in edge_list:
        if vertex_index in edge:
            degree_count += 1
    return degree_count
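As a quick sanity check (my own addition, using a made-up toy edge list rather than the GitHub data), the two helpers above can be verified by hand: a triangle on vertices 1, 2, 3 plus a pendant vertex 4 has four vertices, and vertex 3 should have degree 3.
toy_edges = [(1, 2), (1, 3), (2, 3), (3, 4)]  # hypothetical toy graph: triangle 1-2-3 plus pendant vertex 4
print(num_vertices(toy_edges))      # expected: 4
print(vertex_degree(toy_edges, 3))  # vertex 3 touches (1,3), (2,3), (3,4) -> expected: 3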
def clustering_coef(edge_list, vertex_index):
    # Build an adjacency list from the edge list.
    edge_dict = {}
    for edge in edge_list:
        if edge[0] not in edge_dict:
            edge_dict[edge[0]] = []
        if edge[1] not in edge_dict:
            edge_dict[edge[1]] = []
        edge_dict[edge[0]] += [edge[1]]
        edge_dict[edge[1]] += [edge[0]]
    connected_nodes = edge_dict[vertex_index]
    if len(connected_nodes) < 2:
        # Fewer than two neighbors means no possible neighbor pairs; treat as 0.
        return 0
    # Count edges between pairs of neighbors (each such edge gets counted twice).
    num_connections_bet_nodes = 0
    for connection in connected_nodes:
        for node in edge_dict[connection]:
            if node != vertex_index and node in connected_nodes:
                num_connections_bet_nodes += 1
    num_connections_bet_nodes /= 2
    return num_connections_bet_nodes / (
        (len(connected_nodes) * (len(connected_nodes) - 1)) / 2
    )
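Same kind of sanity check on the toy edge list from above (again my addition): vertex 3's neighbors are 1, 2, and 4, and only the pair (1, 2) is itself an edge, so its clustering coefficient should come out to 1/3 and agree with NetworkX.
# One of vertex 3's three possible neighbor pairs is connected, so expect 1/3.
print(clustering_coef(toy_edges, 3))
print(nx.clustering(nx.Graph(toy_edges), 3))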
def betweennes_centrality(edge_list, vertex_index):
    graph = nx.Graph()
    for edge in edge_list:
        graph.add_edge(edge[0], edge[1])
    node_list = list(graph.nodes)
    sum_bet_cent = 0
    # For every pair of other nodes, add the fraction of shortest paths
    # between them that pass through vertex_index.
    for node in node_list[:-1]:
        for node2 in node_list[node_list.index(node) + 1 :]:
            if node == vertex_index or node2 == vertex_index:
                continue
            counter = 0
            total_counter = 0
            for path in nx.all_shortest_paths(graph, node, node2):
                if vertex_index in path:
                    counter += 1
                total_counter += 1
            sum_bet_cent += counter / total_counter
    return sum_bet_cent
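This function sums, over every pair of other nodes, the fraction of shortest paths between them that run through the chosen vertex, so it should agree with NetworkX's unnormalized betweenness (endpoints excluded) on a connected graph. A small check I added on another made-up graph:
check_edges = [(1, 2), (2, 3), (3, 4), (2, 4)]  # hypothetical toy graph
print(betweennes_centrality(check_edges, 2))
print(nx.betweenness_centrality(nx.Graph(check_edges), normalized=False)[2])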
def average_shortest_path_length(edge_list):
    graph = nx.Graph()
    for edge in edge_list:
        graph.add_edge(edge[0], edge[1])
    node_list = list(graph.nodes)
    counter = 0
    sum_length = 0
    for i in range(len(node_list) - 1):
        for j in range(i + 1, len(node_list)):
            if nx.has_path(graph, node_list[i], node_list[j]):
                sum_length += nx.shortest_path_length(graph, node_list[i], node_list[j])
                counter += 1
    if counter == 0:
        return 0
    return sum_length / counter
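On a connected graph this should match nx.average_shortest_path_length exactly; the NetworkX version raises an error on disconnected graphs, while the version above simply skips unreachable pairs. A quick check I added, reusing the toy graph from the betweenness check:
# Both calls should print the same value for the connected toy graph.
print(average_shortest_path_length(check_edges))
print(nx.average_shortest_path_length(nx.Graph(check_edges)))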
def adjacency_matrix(edge_list):
    # Assumes vertices are labeled 1..n with no gaps, since labels are used as matrix indices.
    num_vert = num_vertices(edge_list)
    mat = np.zeros((num_vert, num_vert))
    for edge in edge_list:
        mat[edge[0] - 1][edge[1] - 1] = 1
        mat[edge[1] - 1][edge[0] - 1] = 1
    return mat
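One caveat worth flagging (my comment, not from the original analysis): this function assumes vertices are labeled 1 through n with no gaps, because the labels are used directly as matrix indices. With arbitrary labels, like the node ids in the sampled GitHub subgraph, the labels would need to be remapped to 0..n-1 first. Under that assumption it behaves as expected on a toy triangle:
# Only valid when labels are exactly 1..n; otherwise remap labels to indices first.
print(adjacency_matrix([(1, 2), (2, 3), (3, 1)]))
# [[0. 1. 1.]
#  [1. 0. 1.]
#  [1. 1. 0.]]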
def ec_eigen_centrality(adj_mat):
    # Power iteration; the adjacency matrix is symmetric, so no transpose is needed.
    eigen = np.ones(len(adj_mat))
    last_eigen = np.zeros(len(adj_mat))
    max_dif = 0.0001
    last_dif = np.inf
    # Iterate until successive estimates stop changing (sum of absolute differences).
    while (this_dif := np.abs(last_eigen - eigen).sum()) > max_dif:
        if this_dif == last_dif:
            # No further progress (e.g. oscillation); stop to avoid looping forever.
            break
        last_dif = this_dif
        last_eigen = eigen.copy()
        eigen = np.matmul(adj_mat, eigen)
        # Rescale so the largest entry is 1, which keeps the values from overflowing.
        eigen = eigen / max(eigen)
    return eigen
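Because the power iteration above rescales by the largest entry rather than by the Euclidean norm, its values will not match nx.eigenvector_centrality exactly, but the ranking of nodes should. A small comparison I added on the toy graph:
# Compare rankings only; the two results use different normalizations.
check_edges = [(1, 2), (2, 3), (3, 4), (2, 4)]
ec_custom = ec_eigen_centrality(adjacency_matrix(check_edges))
ec_nx = nx.eigenvector_centrality(nx.Graph(check_edges))
print(sorted(range(1, 5), key=lambda v: ec_custom[v - 1], reverse=True))
print(sorted(ec_nx, key=ec_nx.get, reverse=True))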
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import random
gh_graph_org = nx.read_edgelist("rec-github.edges", create_using = nx.Graph(), nodetype=int)
random_nodes_from_gh_graph = random.sample(list(gh_graph_org.nodes), 40000)  # random.sample needs a sequence, not a node view
random_subgraph_from_gh = gh_graph_org.subgraph(random_nodes_from_gh_graph)
largest_cc_nodes = max(nx.connected_components(random_subgraph_from_gh), key=len)
gh_graph = gh_graph_org.subgraph(largest_cc_nodes)
print(len(gh_graph))
19498
plt.figure(figsize=(10,10))
nx.draw_spring(gh_graph, with_labels=False, node_size=[gh_graph.degree(v) for v in gh_graph])
node_degrees = [(gh_graph.degree(v), v) for v in gh_graph]
sorted_degrees = sorted(node_degrees, key=lambda x: x[0], reverse=True)
for i in range(10):
    print(f"{i}. Node {sorted_degrees[i][1]} has degree {sorted_degrees[i][0]}")
0. Node 654 has degree 532
1. Node 301 has degree 498
2. Node 616 has degree 468
3. Node 8 has degree 395
4. Node 866 has degree 376
5. Node 81 has degree 341
6. Node 650 has degree 315
7. Node 518 has degree 305
8. Node 212 has degree 294
9. Node 564 has degree 293
bet_cent = nx.betweenness_centrality(gh_graph, endpoints=True)
sorted_bet_cent = sorted(bet_cent.items(), key=lambda x: x[1], reverse=True)
for i in range(10):
    print(f"{i}. Node {sorted_bet_cent[i][0]} has betweenness centrality {sorted_bet_cent[i][1]}")
0. Node 301 has betweenness centrality 0.06329461106590673
1. Node 654 has betweenness centrality 0.058230759235956636
2. Node 616 has betweenness centrality 0.04720055753877458
3. Node 8 has betweenness centrality 0.044279421324421045
4. Node 81 has betweenness centrality 0.039412193220971364
5. Node 518 has betweenness centrality 0.03428559661706492
6. Node 212 has betweenness centrality 0.03314051968697511
7. Node 866 has betweenness centrality 0.03158174218865254
8. Node 94 has betweenness centrality 0.03131432324128569
9. Node 564 has betweenness centrality 0.029717606097352835
The ten nodes shown below are simply the first ten in the list sorted by clustering coefficient; many nodes are tied at 1.0, so which ten appear is essentially arbitrary.
clustering_coeffs = nx.clustering(gh_graph)
sorted_coeffs = sorted(clustering_coeffs.items(), reverse=True, key=lambda x: x[1])
for i in range(10):
    print(f"{i}. Node {sorted_coeffs[i][0]} has clustering coefficient {sorted_coeffs[i][1]}")
0. Node 33636 has clustering coefficient 1.0
1. Node 1004 has clustering coefficient 1.0
2. Node 66560 has clustering coefficient 1.0
3. Node 37013 has clustering coefficient 1.0
4. Node 37424 has clustering coefficient 1.0
5. Node 37884 has clustering coefficient 1.0
6. Node 5359 has clustering coefficient 1.0
7. Node 71534 has clustering coefficient 1.0
8. Node 39100 has clustering coefficient 1.0
9. Node 40090 has clustering coefficient 1.0
eigen_centrality = nx.eigenvector_centrality(gh_graph, max_iter=400)
sorted_eigens = sorted(eigen_centrality.items(), reverse=True, key=lambda x: x[1])
for i in range(10):
    print(f"{i}. Node {sorted_eigens[i][0]} has eigenvector centrality {sorted_eigens[i][1]}")
0. Node 654 has eigenvector centrality 0.21237810554763278
1. Node 616 has eigenvector centrality 0.18851476693646493
2. Node 301 has eigenvector centrality 0.17947860521926398
3. Node 8 has eigenvector centrality 0.15324041452261145
4. Node 518 has eigenvector centrality 0.13662177016604668
5. Node 542 has eigenvector centrality 0.1319877969418028
6. Node 94 has eigenvector centrality 0.13086384933196013
7. Node 564 has eigenvector centrality 0.12867042242313892
8. Node 16 has eigenvector centrality 0.11727428233171622
9. Node 650 has eigenvector centrality 0.10842289238724852
pagerank = nx.pagerank(gh_graph, max_iter=400)
sorted_pagerank = sorted(pagerank.items(), reverse=True, key=lambda x: x[1])
for i in range(10):
    print(f"{i}. Node {sorted_pagerank[i][0]} has pagerank {sorted_pagerank[i][1]}")
0. Node 301 has pagerank 0.003943207488007591
1. Node 654 has pagerank 0.0037798114051954194
2. Node 866 has pagerank 0.00348094307728918
3. Node 616 has pagerank 0.0033272334399155157
4. Node 81 has pagerank 0.0030444101878482404
5. Node 8 has pagerank 0.0029345473539133683
6. Node 650 has pagerank 0.002331479163558558
7. Node 212 has pagerank 0.002330785388422002
8. Node 518 has pagerank 0.0021947006020938066
9. Node 83 has pagerank 0.0021913540186490857
The measures above tend to surface many of the same nodes. Clustering coefficient is the only one whose top ten looks completely different, and those nodes are all tied at a coefficient of 1; the top nodes from the other measures would probably also have a clustering coefficient of 1, but there are simply too many ties for them to show up in the first ten. I think this shows that the people and projects that are most central by one measure tend to be central by all of them. Nodes 301, 654, 866, and 8 seem to be especially important. It would be interesting to see which projects or people these nodes represent.
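To make the overlap a bit more concrete, the top-ten sets can be intersected directly from the dictionaries already computed above (a small addition of mine):
# Count how many of the top-10 nodes each pair of measures has in common.
def top10(score_dict):
    return {node for node, _ in sorted(score_dict.items(), key=lambda x: x[1], reverse=True)[:10]}

top_sets = {
    "degree": {node for _, node in sorted_degrees[:10]},
    "betweenness": top10(bet_cent),
    "eigenvector": top10(eigen_centrality),
    "pagerank": top10(pagerank),
}
for name_a in top_sets:
    for name_b in top_sets:
        if name_a < name_b:
            print(f"{name_a} vs {name_b}: {len(top_sets[name_a] & top_sets[name_b])}/10 in common")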
import math
ave_shortest_path = nx.average_shortest_path_length(gh_graph)
print(f"Average shortest path length is {ave_shortest_path}")
print(f"Natural log of number of nodes is {math.log(len(gh_graph))}")
print(f"Log base 10 of number of nodes is {math.log(len(gh_graph), 10)}")
print(f"Log base 2 of number of nodes is {math.log(len(gh_graph), 2)}")
# print(f"The small-world coefficient omega is {nx.omega(gh_graph)}")
Average shortest path length is 4.895764638205489
Natural log of number of nodes is 9.878067175189218
Log base 10 of number of nodes is 4.289990066054319
Log base 2 of number of nodes is 14.251038527213536
This graph seems to exhibit small-world behavior: the average shortest path length (about 4.9) is on the same order as the logarithm of the number of nodes (log base 10 is about 4.3). I tried to compute the omega small-world coefficient, but it ran for over 16 hours without finishing.
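As a rough benchmark I added afterwards (not a substitute for omega): a random graph with the same number of nodes and mean degree has an expected average shortest path length of roughly ln(n) / ln(mean degree), so the observed value can be compared against that.
# Rough random-graph benchmark for the average shortest path length.
n = gh_graph.number_of_nodes()
mean_degree = 2 * gh_graph.number_of_edges() / n
print(f"Random-graph benchmark ln(n)/ln(<k>): {math.log(n) / math.log(mean_degree):.2f}")
print(f"Observed average shortest path length: {ave_shortest_path:.2f}")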
dict_degrees = dict(nx.degree(gh_graph))
degree_vals = dict_degrees.values()
unique, count = np.unique(list(degree_vals), return_counts=True)
plt.scatter(unique, count)
plt.xscale('log', base=2)  # base= replaces the older basex=/basey= keywords
plt.yscale('log', base=2)
plt.xlabel('log(degree)')
plt.ylabel('log(frequency)')
This graph shows power law behavior pretty clearly. What a nice line!
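To put a rough number on that line (my addition; a proper fit would use maximum likelihood, for example via the powerlaw package), the slope of the log-log points computed above can be estimated with ordinary least squares:
# Crude power-law exponent estimate: slope of a least-squares line in log-log space.
log_deg = np.log(unique.astype(float))
log_freq = np.log(count.astype(float))
slope, _ = np.polyfit(log_deg, log_freq, 1)
print(f"Estimated power-law exponent (negative slope): {-slope:.2f}")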
dict_degrees = dict(nx.degree(gh_graph))
degree_vals = dict_degrees.values()
unique, count = np.unique(list(degree_vals), return_counts=True)
# Average the clustering coefficient over all nodes of each degree.
clust_degree_dict = {}
for node, degree in dict_degrees.items():
    if clust_degree_dict.get(degree) is None:
        clust_degree_dict[degree] = [0, 0]
    clust_degree_dict[degree][0] += clustering_coeffs[node]
    clust_degree_dict[degree][1] += 1
final_dict = {}
for degree, clust in clust_degree_dict.items():
    # clust[0] is the summed clustering coefficient, clust[1] the node count.
    final_dict[degree] = clust[0] / clust[1]
plt.scatter(final_dict.keys(), final_dict.values())
plt.xscale('log', base=2)
plt.yscale('log', base=2)
plt.xlabel('log(degree)')
plt.ylabel('log(average clustering coefficient)')
According to this plot, the average clustering coefficient at each degree is either 1 or 0 for this graph. I'm not sure why that is. One guess: if the graph were really loaded as a two-mode member/project graph it would contain no triangles at all, so every clustering coefficient would be 0; the nodes with coefficient 1 suggest the member and project ids may overlap in the raw integer edge list and got merged on load. The quick check below probes that.
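One quick way to test that guess (my follow-up): a strictly bipartite member/project graph has no odd cycles, hence no triangles, so checking bipartiteness directly on both the sampled subgraph and the full loaded graph shows whether the two node sets were actually kept separate when the edge list was read in.
# A bipartite graph contains no triangles, so clustering coefficients of 1 are only
# possible if these checks print False (i.e. the two node sets were merged on load).
print(nx.is_bipartite(gh_graph))
print(nx.is_bipartite(gh_graph_org))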