Browse Source

basic script for inferring info about XML structures

Getty Ritter 5 years ago
commit
60643125b2
1 changed file with 92 additions and 0 deletions
  1. 92 0
      infer-xml.py

+ 92 - 0
infer-xml.py

@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+
+import xml.etree.ElementTree as xml
+import sys
+
+class Tagdata:
+    """Accumulated observations about every element sharing one tag name.
+
+    Attributes:
+        attrs: maps an attribute name to ``(optional, samples)``, where
+            ``optional`` is a bool and ``samples`` is the set of values
+            seen for that attribute.
+        elems: maps a child tag name to a bool that is True once the
+            child has been marked optional.
+        content: set of content samples recorded via ``add_content``.
+    """
+
+    def __init__(self):
+        self.attrs = {}
+        self.elems = {}
+        self.content = set()
+
+    def add_attr(self, k, v):
+        """Record value ``v`` as a sample for attribute ``k``.
+
+        A newly seen attribute starts out mandatory; an attribute that
+        was already marked optional stays optional.
+        """
+        (optional, samples) = self.attrs.get(k, (False, set()))
+        self.attrs[k] = (optional, samples | {v})
+
+    def add_elem(self, elem):
+        """Record ``elem.tag`` as a child tag.
+
+        A newly seen child starts out mandatory; a child that is already
+        recorded keeps its current status.
+        """
+        self.elems.setdefault(elem.tag, False)
+
+    def add_content(self, stuff):
+        """Record content samples.
+
+        A string is stored as one whole sample (building a set from it
+        would break it into single characters), ``None`` is ignored, and
+        any other iterable contributes each of its items.
+        """
+        if stuff is None:
+            return
+        if isinstance(stuff, str):
+            self.content.add(stuff)
+        else:
+            self.content.update(stuff)
+
+    def make_attrs_optional(self, attrs):
+        """Mark every recorded attribute absent from ``attrs`` optional."""
+        for name in set(self.attrs) - set(attrs):
+            (_, samples) = self.attrs[name]
+            self.attrs[name] = (True, samples)
+
+    def make_elems_optional(self, elems):
+        """Mark every recorded child tag absent from ``elems`` optional."""
+        for name in set(self.elems) - set(elems):
+            self.elems[name] = True
+
+    def __repr__(self):
+        return 'Tagdata(attrs={}, elems={})'.format(
+            self.attrs, self.elems)
+
+class Traverse:
+    """Walk an element tree and build one Tagdata record per tag name.
+
+    ``cache`` maps a tag name to the Tagdata shared by every element
+    with that tag, wherever it appears in the tree.
+    """
+
+    def __init__(self):
+        self.cache = {}
+
+    def add_first_element(self, elem):
+        """Create the record for ``elem.tag`` from its first occurrence.
+
+        Every attribute and child present on the first occurrence starts
+        out mandatory.
+        """
+        self.cache[elem.tag] = tag = Tagdata()
+
+        for k, v in elem.attrib.items():
+            tag.add_attr(k, v)
+
+        children = list(elem)
+        for child in children:
+            tag.add_elem(child)
+
+        # Recurse only after this element is fully recorded: a nested
+        # element with the same tag is merged into this same record and
+        # must be compared against the complete set of children.
+        for child in children:
+            self.add_element(child)
+
+    def add_subsequent_element(self, elem):
+        """Merge a later occurrence of ``elem.tag`` into its record.
+
+        An attribute or child ends up optional when it is missing from
+        this occurrence, or when it shows up here for the first time
+        (meaning earlier occurrences did not have it).
+        """
+        tag = self.cache[elem.tag]
+        children = list(elem)
+
+        # Snapshot what earlier occurrences had before adding anything.
+        known_attrs = set(tag.attrs)
+        known_elems = set(tag.elems)
+
+        for k, v in elem.attrib.items():
+            tag.add_attr(k, v)
+        for child in children:
+            # Register the child itself, not ``elem``: the child's tag
+            # is what belongs in the parent's record.
+            tag.add_elem(child)
+
+        # Only names that were already known AND are present on this
+        # occurrence keep their status; everything else becomes optional.
+        tag.make_attrs_optional(known_attrs & set(elem.attrib))
+        tag.make_elems_optional(known_elems & {c.tag for c in children})
+
+        for child in children:
+            self.add_element(child)
+
+    def add_element(self, elem):
+        """Record ``elem`` and, recursively, all of its descendants."""
+        if elem.tag not in self.cache:
+            self.add_first_element(elem)
+        else:
+            self.add_subsequent_element(elem)
+
+
+def main(path):
+    """Parse the XML file at ``path`` and print what was inferred per tag.
+
+    Each tag line is followed by one line per attribute and one line per
+    child tag, each labelled ``optional`` or ``mandatory``.
+    """
+    def label(is_optional):
+        return 'optional' if is_optional else 'mandatory'
+
+    walker = Traverse()
+    walker.add_element(xml.parse(path).getroot())
+
+    for name, data in walker.cache.items():
+        print('tag {0}'.format(name))
+        # Looping over an empty dict prints nothing, so neither loop
+        # needs an emptiness check in front of it.
+        for attr, (optional, _samples) in data.attrs.items():
+            print('  - attr {0} ({1})'.format(attr, label(optional)))
+        for child, optional in data.elems.items():
+            print('  - child {0} ({1})'.format(child, label(optional)))
+
+if __name__ == '__main__':
+    # Run only when executed as a script; the XML path is taken from
+    # the first command-line argument.
+    if sys.argv[1:]:
+        main(sys.argv[1])
+    else:
+        sys.stderr.write(
+            'usage: {0} [file.xml]\n'.format(sys.argv[0]))
+        # A missing argument is an error: exit non-zero so callers and
+        # shell scripts can tell the run did not happen.
+        sys.exit(1)