plot_words.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. """
  2. =====
  3. Words
  4. =====
  5. Words/Ladder Graph
  6. ------------------
  7. Generate an undirected graph over the 5757 5-letter words in the
  8. datafile `words_dat.txt.gz`. Two words are connected by an edge
  9. if they differ in one letter, resulting in 14,135 edges. This example
  10. is described in Section 1.1 in Knuth's book (see [1]_ and [2]_).
  11. References
  12. ----------
  13. .. [1] Donald E. Knuth,
  14. "The Stanford GraphBase: A Platform for Combinatorial Computing",
  15. ACM Press, New York, 1993.
  16. .. [2] http://www-cs-faculty.stanford.edu/~knuth/sgb.html
  17. """
  18. # Authors: Aric Hagberg (hagberg@lanl.gov),
  19. # Brendt Wohlberg,
  20. # hughdbrown@yahoo.com
  21. # Copyright (C) 2004-2019 by
  22. # Aric Hagberg <hagberg@lanl.gov>
  23. # Dan Schult <dschult@colgate.edu>
  24. # Pieter Swart <swart@lanl.gov>
  25. # All rights reserved.
  26. # BSD license.
  27. import gzip
  28. from string import ascii_lowercase as lowercase
  29. import networkx as nx
  30. #-------------------------------------------------------------------
  31. # The Words/Ladder graph of Section 1.1
  32. #-------------------------------------------------------------------
  33. def generate_graph(words):
  34. G = nx.Graph(name="words")
  35. lookup = dict((c, lowercase.index(c)) for c in lowercase)
  36. def edit_distance_one(word):
  37. for i in range(len(word)):
  38. left, c, right = word[0:i], word[i], word[i + 1:]
  39. j = lookup[c] # lowercase.index(c)
  40. for cc in lowercase[j + 1:]:
  41. yield left + cc + right
  42. candgen = ((word, cand) for word in sorted(words)
  43. for cand in edit_distance_one(word) if cand in words)
  44. G.add_nodes_from(words)
  45. for word, cand in candgen:
  46. G.add_edge(word, cand)
  47. return G
  48. def words_graph():
  49. """Return the words example graph from the Stanford GraphBase"""
  50. fh = gzip.open('words_dat.txt.gz', 'r')
  51. words = set()
  52. for line in fh.readlines():
  53. line = line.decode()
  54. if line.startswith('*'):
  55. continue
  56. w = str(line[0:5])
  57. words.add(w)
  58. return generate_graph(words)
  59. if __name__ == '__main__':
  60. G = words_graph()
  61. print("Loaded words_dat.txt containing 5757 five-letter English words.")
  62. print("Two words are connected if they differ in one letter.")
  63. print("Graph has %d nodes with %d edges"
  64. % (nx.number_of_nodes(G), nx.number_of_edges(G)))
  65. print("%d connected components" % nx.number_connected_components(G))
  66. for (source, target) in [('chaos', 'order'),
  67. ('nodes', 'graph'),
  68. ('pound', 'marks')]:
  69. print("Shortest path between %s and %s is" % (source, target))
  70. try:
  71. sp = nx.shortest_path(G, source, target)
  72. for n in sp:
  73. print(n)
  74. except nx.NetworkXNoPath:
  75. print("None")