plot_roget.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. #!/usr/bin/env python
  2. """
  3. =====
  4. Roget
  5. =====
  6. Build a directed graph of 1022 categories and
  7. 5075 cross-references as defined in the 1879 version of Roget's Thesaurus
  8. contained in the datafile roget_dat.txt. This example is described in
  9. Section 1.2 in Knuth's book (see [1]_ and [2]_).
  10. Note that one of the 5075 cross-references is a self loop, yet
  11. it is included in the graph built here because
  12. the standard networkx `DiGraph` class allows self loops
  13. (cf. the data line ``400pungency:400 401 403 405``).
  14. References
  15. ----------
  16. .. [1] Donald E. Knuth,
  17. "The Stanford GraphBase: A Platform for Combinatorial Computing",
  18. ACM Press, New York, 1993.
  19. .. [2] http://www-cs-faculty.stanford.edu/~knuth/sgb.html
  20. """
  21. # Authors: Brendt Wohlberg, Aric Hagberg (hagberg@lanl.gov)
  22. # Date: 2005-04-01 07:56:22 -0700 (Fri, 01 Apr 2005)
  23. # Copyright (C) 2004-2019 by
  24. # Aric Hagberg <hagberg@lanl.gov>
  25. # Dan Schult <dschult@colgate.edu>
  26. # Pieter Swart <swart@lanl.gov>
  27. # All rights reserved.
  28. # BSD license.
  29. import gzip
  30. import re
  31. import sys
  32. import matplotlib.pyplot as plt
  33. from networkx import nx
  34. def roget_graph():
  35. """ Return the thesaurus graph from the roget.dat example in
  36. the Stanford Graph Base.
  37. """
  38. # open file roget_dat.txt.gz (or roget_dat.txt)
  39. fh = gzip.open('roget_dat.txt.gz', 'r')
  40. G = nx.DiGraph()
  41. for line in fh.readlines():
  42. line = line.decode()
  43. if line.startswith("*"): # skip comments
  44. continue
  45. if line.startswith(" "): # this is a continuation line, append
  46. line = oldline + line
  47. if line.endswith("\\\n"): # continuation line, buffer, goto next
  48. oldline = line.strip("\\\n")
  49. continue
  50. (headname, tails) = line.split(":")
  51. # head
  52. numfind = re.compile("^\d+") # re to find the number of this word
  53. head = numfind.findall(headname)[0] # get the number
  54. G.add_node(head)
  55. for tail in tails.split():
  56. if head == tail:
  57. print("skipping self loop", head, tail, file=sys.stderr)
  58. G.add_edge(head, tail)
  59. return G
  60. if __name__ == '__main__':
  61. G = roget_graph()
  62. print("Loaded roget_dat.txt containing 1022 categories.")
  63. print("digraph has %d nodes with %d edges"
  64. % (nx.number_of_nodes(G), nx.number_of_edges(G)))
  65. UG = G.to_undirected()
  66. print(nx.number_connected_components(UG), "connected components")
  67. options = {
  68. 'node_color': 'black',
  69. 'node_size': 1,
  70. 'line_color': 'grey',
  71. 'linewidths': 0,
  72. 'width': 0.1,
  73. }
  74. nx.draw_circular(UG, **options)
  75. plt.show()