Module minidom_ext

Functions to improve xml.dom.minidom tools in Python.

Expand source code
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-

"""
        Functions to improve xml.dom.minidom tools in Python.
"""
import os
from xml.dom.minidom import Node, Element, parse, parseString
import re
from lxml import etree  # http://lxml.de/index.html#documentation

# pdoc3 --html --force minidom_ext.py

#==================================================
#============ Tools ===============================
#==================================================

def existFile(f):
        """ Tell whether the path `f` designates an existing regular file. """
        is_regular_file = os.path.isfile(f)
        return is_regular_file

def existDir(d):
        """
        Tell whether the path `d` designates an existing directory.

        Returns False for a missing path and also for a path that exists
        but is a regular file.
        """
        # os.path.exists() would also accept regular files; isdir() matches the documented contract
        return os.path.isdir(d)

#==================================================
#============ class DOMCompanion ==================
#==================================================

class DOMCompanion :
        """
        Companion object wrapping an xml.dom.minidom document.

        It loads a document, optionally validates it against its DTD (with lxml),
        adds the default attribute values declared in the DTD and indexes the
        elements that carry an attribute declared with the ID type.

        Attributes
        ----------
        doc : Node.DOCUMENT_NODE or None
                the DOM structure (None until a document is loaded)
        documentElement : Node.ELEMENT_NODE or None
                equivalent to doc.documentElement
        lid : dict
                maps each ID value collected in the document to its element
        """

        # ===========================================================================================
        def __init__(self, doc = None) :
                """
                class constructor.

                Parameters
                ----------
                doc : Node.DOCUMENT_NODE, optional
                        DOM structure

                Notes
                -----
                The DOM is also enriched with default attributes if a DTD is specified
                """
                self.doc = doc
                self.lid = dict()
                if doc is not None :
                        self.documentElement = doc.documentElement
                        # the enrichment method is private (two leading underscores)
                        self.__enrichXML()
                else :
                        self.documentElement = None

        # ===========================================================================================
        def parse(self, file, validate = False):
                """
                to load an XML file

                Nothing is loaded (and nothing is reported) when `file` does not exist.

                Parameters
                ----------
                file : str
                        path of the XML file to load

                validate : boolean, optional
                        flag to validate the XML file if it contains a Doctype section

                See Also
                --------
                `DOMCompanion.validate`

                Notes
                -----
                if a DTD is specified, uses it to add default attributes and to collect IDs
                See https://docs.python.org/3/library/xml.dom.minidom.html
                """
                if not existFile(file) :
                        return
                self.doc = parse(file)
                self.documentElement = self.doc.documentElement
                if validate :
                        self.validate()
                else :
                        self.__enrichXML()

        # ===========================================================================================
        def parseString(self, xml, validate = False):
                """
                to load an XML string

                Parameters
                ----------
                xml : str
                        the string that contains the XML

                validate : boolean, optional
                        flag to validate the XML document if it contains a Doctype section

                See Also
                --------
                `DOMCompanion.validate`

                Notes
                -----
                if a DTD is specified, uses it to add default attributes and to collect IDs
                See https://docs.python.org/3/library/xml.dom.minidom.html
                """
                self.doc = parseString(xml)
                self.documentElement = self.doc.documentElement
                if validate :
                        self.validate()
                else :
                        self.__enrichXML()

        # ===========================================================================================
        def getElementsByTagName(self, name) :
                """
                the DOM getElementsByTagName

                Parameters
                ----------
                name : str
                        the Element to find in the DOM

                Returns
                -------
                NodeList or None
                        a list of elements, or None when no document is loaded
                """
                if self.doc is None :
                        return None
                return self.doc.getElementsByTagName(name)

        # ===========================================================================================
        def getElementById(self, id) :
                """
                to retrieve an element by its ID

                Parameters
                ----------
                id : str
                        the ID of the element to find

                Returns
                -------
                Node.ELEMENT_NODE or None
                        the element, or None when the ID was not collected
                """
                return self.lid.get(id)

        # ===========================================================================================
        def toLighter(self, del_spaces = True, del_comments = True, del_pi = True) :
                """
                to suppress text nodes (with only separators), processing instructions and/or comments

                Parameters
                ----------
                del_spaces : boolean, optional
                        to suppress blank nodes (containing only newline, tab and space characters)
                del_comments : boolean, optional
                        to suppress comment nodes
                del_pi : boolean, optional
                        to suppress processing instruction nodes

                Returns
                -------
                DOMCompanion
                        itself
                """
                if self.doc is not None :
                        self.__purgeDOM(self.doc, del_spaces, del_comments, del_pi)
                return self

        # ===========================================================================================
        def validate(self) :
                """
                to validate the XML according to its DTD (enrich it too). It uses the lxml module to validate the XML document.

                Returns
                -------
                boolean
                        False when no document is loaded, when the DTD file cannot be found
                        or when the document is invalid; True when the document is valid or
                        does not reference any DTD
                """
                if self.doc is None :
                        return False
                dtdFile = self.__dtdFile()
                if dtdFile is None :
                        # nothing to validate against
                        return True
                if not existFile(dtdFile) :
                        print('Unable ti find the DTD file ',dtdFile)
                        return False
                parser = etree.XMLParser(recover=True, strip_cdata=True)
                tree = etree.XML(self.doc.toxml(), parser)
                dtd = etree.DTD(dtdFile)
                if dtd.validate(tree) :
                        self.__enrichXML()
                        return True
                print(dtd.error_log.filter_from_errors()[0])
                return False

        # ===========================================================================================
        def toxml(self) :
                """
                produce XML string

                Returns
                -------
                str
                        the XML string

                Notes
                -----
                A document must have been loaded first.
                See https://docs.python.org/3/library/xml.dom.minidom.html
                """
                return self.doc.toxml()

        # ===========================================================================================
        def toprettyxml(self,indent="\t", newl="\n", encoding=None, standalone=None) :
                """
                produce pretty-printed version of the XML string

                Parameters
                ----------
                indent : str, optional
                        indentation string
                newl : str, optional
                        string emitted at the end of each line
                encoding : str, optional
                        encoding forwarded to minidom
                standalone : boolean, optional
                        standalone flag forwarded to minidom, only when it is not None

                Returns
                -------
                str
                        the XML string (whatever minidom returns for the given encoding)

                Notes
                -----
                See https://docs.python.org/3/library/xml.dom.minidom.html
                """
                if standalone is None :
                        # older minidom versions do not accept the standalone argument
                        return self.doc.toprettyxml(indent, newl, encoding)
                return self.doc.toprettyxml(indent, newl, encoding, standalone=standalone)

        # ===========================================================================================
        #####################################
        ########## private methods ##########
        #####################################

        def __dtdFile(self) :
                # system identifier of the DOCTYPE, or None when the document has no DOCTYPE
                doctype = self.doc.doctype
                if doctype is None :
                        return None
                return doctype.systemId

        def __enrichXML(self) :
                # reset the ID index, then add DTD defaults and collect IDs from the document
                if self.doc is None :
                        return
                self.lid = dict()
                dtdFile = self.__dtdFile()
                if dtdFile is None :
                        return
                if existFile(dtdFile) :
                        le = self.__extractDTD(dtdFile)
                        self.__enrichNode(self.doc.documentElement, le)
                else :
                        print('Unable ti find the DTD file ',dtdFile)

        def __purgeDOM(self, no, del_spaces, del_comments, del_pi) :
                # remove, recursively, the blank text / comment / processing instruction children of `no`
                if no.nodeType not in (Node.ELEMENT_NODE, Node.DOCUMENT_NODE) :
                        return no
                # collect first: the child list must not change while it is iterated
                toDel = []
                for n in no.childNodes :
                        if del_spaces and n.nodeType == Node.TEXT_NODE and n.data.strip('\t \n') == '' :
                                toDel.append(n)
                        elif del_comments and n.nodeType == Node.COMMENT_NODE :
                                toDel.append(n)
                        elif del_pi and n.nodeType == Node.PROCESSING_INSTRUCTION_NODE :
                                toDel.append(n)
                        elif n.nodeType == Node.ELEMENT_NODE :
                                self.__purgeDOM(n, del_spaces, del_comments, del_pi)
                for n in toDel :
                        no.removeChild(n)
                return no


        def __getDTD(self, file) :
                # content of the DTD file, or None when the file does not exist
                if not existFile(file) :
                        return None
                with open(file,'r') as f :
                        return f.read()


        def __extractDTD(self, file) :
                # {element name: {attribute name: (type, status or default value)}} read from the DTD file
                dtd = self.__getDTD(file)
                if dtd is None :
                        return dict()
                return self.__parseDTD(dtd)


        def __parseDTD(self, dtd) :
                # same structure as __extractDTD, built from the DTD text itself
                el = re.compile(r'<!ELEMENT (?P<elementname>[\w\-\:\_]+) (?P<description>.*)\s*>')
                att = re.compile(r'<!ATTLIST (?P<elementname>[\w\-\:\_]+) (?P<attributs>.*)\s*>')
                att2 = re.compile(r'(?P<attname>[\w\-\:\_]+) (?P<def>.*?) (?P<status>#[\w\-\:\_]+|[\"\'].*?[\"\'])')
                comment = re.compile(r'<!--.*?-->', re.DOTALL)

                # comments are dropped first so that declarations they contain are not registered
                dtd = comment.sub(' ', dtd).replace('\n',' ').replace('\t',' ')
                liste_elem = dict()
                for item in re.findall(r'<.*?>', dtd) :
                        grp = el.match(item)
                        if grp is not None :
                                # setdefault keeps attributes already registered by an earlier ATTLIST
                                liste_elem.setdefault(grp.group('elementname').strip('\t \n'), dict())
                                continue
                        grp = att.match(item)
                        if grp is None :
                                continue
                        # an ATTLIST may come before (or without) the ELEMENT declaration
                        attributs = liste_elem.setdefault(grp.group('elementname'), dict())
                        for (nom, definition, status) in att2.findall(grp.group('attributs')) :
                                nomAtt = nom.strip('\t \n')
                                definition = definition.strip('\t \n')
                                status = status.strip('\t \n')
                                attributs[nomAtt] = (definition, status.replace("'",'').replace('"',''))
                return liste_elem


        def __enrichNode(self, node, le) :
                # add the default attributes of `node` and of its descendants, and index their IDs
                if node.nodeType != Node.ELEMENT_NODE :
                        return
                # an element that is not declared in the DTD has nothing to add
                for (att, (definition, status)) in le.get(node.tagName, {}).items() :
                        if node.hasAttribute(att) :
                                if definition == 'ID' :
                                        self.lid[node.getAttribute(att)] = node
                        elif '#' not in status :
                                # a status without '#' is a default value
                                node.setAttribute(att,status)
                for n in node.childNodes :
                        self.__enrichNode(n,le)


# Demonstration run: load and validate a sample file, then show the raw DOM,
# one element looked up by ID, and the DOM without blank/comment/PI nodes.
if __name__ == "__main__":
        companion = DOMCompanion()
        companion.parse("semaine10.xml", validate=True)
        raw_xml = companion.doc.toxml()
        print(raw_xml)
        print(companion.getElementById('Ka'))
        lighter = companion.toLighter()
        print(lighter.toxml())

Functions

def existDir(d)

tests if the directory exists

Expand source code
def existDir(d):
        """
        Tell whether the path `d` designates an existing directory.

        Returns False for a missing path and also for a path that exists
        but is a regular file.
        """
        # os.path.exists() would also accept regular files; isdir() matches the documented contract
        return os.path.isdir(d)
def existFile(f)

tests if the file exists

Expand source code
def existFile(f):
        """ Tell whether the path `f` designates an existing regular file. """
        is_regular_file = os.path.isfile(f)
        return is_regular_file

Classes

class DOMCompanion (doc=None)

Functions to improve xml.dom.minidom tools in Python.

Attributes

doc : Node.DOCUMENT_NODE
the DOM structure
documentElement : Node.ELEMENT_NODE
equivalent to doc.documentElement

class constructor.

Parameters

doc : Node.DOCUMENT_NODE, optional
DOM structure

Notes

The DOM is also enriched with default attributes if a DTD is specified

Expand source code
class DOMCompanion :
        """
        Companion object wrapping an xml.dom.minidom document.

        It loads a document, optionally validates it against its DTD (with lxml),
        adds the default attribute values declared in the DTD and indexes the
        elements that carry an attribute declared with the ID type.

        Attributes
        ----------
        doc : Node.DOCUMENT_NODE or None
                the DOM structure (None until a document is loaded)
        documentElement : Node.ELEMENT_NODE or None
                equivalent to doc.documentElement
        lid : dict
                maps each ID value collected in the document to its element
        """

        # ===========================================================================================
        def __init__(self, doc = None) :
                """
                class constructor.

                Parameters
                ----------
                doc : Node.DOCUMENT_NODE, optional
                        DOM structure

                Notes
                -----
                The DOM is also enriched with default attributes if a DTD is specified
                """
                self.doc = doc
                self.lid = dict()
                if doc is not None :
                        self.documentElement = doc.documentElement
                        # the enrichment method is private (two leading underscores)
                        self.__enrichXML()
                else :
                        self.documentElement = None

        # ===========================================================================================
        def parse(self, file, validate = False):
                """
                to load an XML file

                Nothing is loaded (and nothing is reported) when `file` does not exist.

                Parameters
                ----------
                file : str
                        path of the XML file to load

                validate : boolean, optional
                        flag to validate the XML file if it contains a Doctype section

                See Also
                --------
                `DOMCompanion.validate`

                Notes
                -----
                if a DTD is specified, uses it to add default attributes and to collect IDs
                See https://docs.python.org/3/library/xml.dom.minidom.html
                """
                if not existFile(file) :
                        return
                self.doc = parse(file)
                self.documentElement = self.doc.documentElement
                if validate :
                        self.validate()
                else :
                        self.__enrichXML()

        # ===========================================================================================
        def parseString(self, xml, validate = False):
                """
                to load an XML string

                Parameters
                ----------
                xml : str
                        the string that contains the XML

                validate : boolean, optional
                        flag to validate the XML document if it contains a Doctype section

                See Also
                --------
                `DOMCompanion.validate`

                Notes
                -----
                if a DTD is specified, uses it to add default attributes and to collect IDs
                See https://docs.python.org/3/library/xml.dom.minidom.html
                """
                self.doc = parseString(xml)
                self.documentElement = self.doc.documentElement
                if validate :
                        self.validate()
                else :
                        self.__enrichXML()

        # ===========================================================================================
        def getElementsByTagName(self, name) :
                """
                the DOM getElementsByTagName

                Parameters
                ----------
                name : str
                        the Element to find in the DOM

                Returns
                -------
                NodeList or None
                        a list of elements, or None when no document is loaded
                """
                if self.doc is None :
                        return None
                return self.doc.getElementsByTagName(name)

        # ===========================================================================================
        def getElementById(self, id) :
                """
                to retrieve an element by its ID

                Parameters
                ----------
                id : str
                        the ID of the element to find

                Returns
                -------
                Node.ELEMENT_NODE or None
                        the element, or None when the ID was not collected
                """
                return self.lid.get(id)

        # ===========================================================================================
        def toLighter(self, del_spaces = True, del_comments = True, del_pi = True) :
                """
                to suppress text nodes (with only separators), processing instructions and/or comments

                Parameters
                ----------
                del_spaces : boolean, optional
                        to suppress blank nodes (containing only newline, tab and space characters)
                del_comments : boolean, optional
                        to suppress comment nodes
                del_pi : boolean, optional
                        to suppress processing instruction nodes

                Returns
                -------
                DOMCompanion
                        itself
                """
                if self.doc is not None :
                        self.__purgeDOM(self.doc, del_spaces, del_comments, del_pi)
                return self

        # ===========================================================================================
        def validate(self) :
                """
                to validate the XML according to its DTD (enrich it too). It uses the lxml module to validate the XML document.

                Returns
                -------
                boolean
                        False when no document is loaded, when the DTD file cannot be found
                        or when the document is invalid; True when the document is valid or
                        does not reference any DTD
                """
                if self.doc is None :
                        return False
                dtdFile = self.__dtdFile()
                if dtdFile is None :
                        # nothing to validate against
                        return True
                if not existFile(dtdFile) :
                        print('Unable ti find the DTD file ',dtdFile)
                        return False
                parser = etree.XMLParser(recover=True, strip_cdata=True)
                tree = etree.XML(self.doc.toxml(), parser)
                dtd = etree.DTD(dtdFile)
                if dtd.validate(tree) :
                        self.__enrichXML()
                        return True
                print(dtd.error_log.filter_from_errors()[0])
                return False

        # ===========================================================================================
        def toxml(self) :
                """
                produce XML string

                Returns
                -------
                str
                        the XML string

                Notes
                -----
                A document must have been loaded first.
                See https://docs.python.org/3/library/xml.dom.minidom.html
                """
                return self.doc.toxml()

        # ===========================================================================================
        def toprettyxml(self,indent="\t", newl="\n", encoding=None, standalone=None) :
                """
                produce pretty-printed version of the XML string

                Parameters
                ----------
                indent : str, optional
                        indentation string
                newl : str, optional
                        string emitted at the end of each line
                encoding : str, optional
                        encoding forwarded to minidom
                standalone : boolean, optional
                        standalone flag forwarded to minidom, only when it is not None

                Returns
                -------
                str
                        the XML string (whatever minidom returns for the given encoding)

                Notes
                -----
                See https://docs.python.org/3/library/xml.dom.minidom.html
                """
                if standalone is None :
                        # older minidom versions do not accept the standalone argument
                        return self.doc.toprettyxml(indent, newl, encoding)
                return self.doc.toprettyxml(indent, newl, encoding, standalone=standalone)

        # ===========================================================================================
        #####################################
        ########## private methods ##########
        #####################################

        def __dtdFile(self) :
                # system identifier of the DOCTYPE, or None when the document has no DOCTYPE
                doctype = self.doc.doctype
                if doctype is None :
                        return None
                return doctype.systemId

        def __enrichXML(self) :
                # reset the ID index, then add DTD defaults and collect IDs from the document
                if self.doc is None :
                        return
                self.lid = dict()
                dtdFile = self.__dtdFile()
                if dtdFile is None :
                        return
                if existFile(dtdFile) :
                        le = self.__extractDTD(dtdFile)
                        self.__enrichNode(self.doc.documentElement, le)
                else :
                        print('Unable ti find the DTD file ',dtdFile)

        def __purgeDOM(self, no, del_spaces, del_comments, del_pi) :
                # remove, recursively, the blank text / comment / processing instruction children of `no`
                if no.nodeType not in (Node.ELEMENT_NODE, Node.DOCUMENT_NODE) :
                        return no
                # collect first: the child list must not change while it is iterated
                toDel = []
                for n in no.childNodes :
                        if del_spaces and n.nodeType == Node.TEXT_NODE and n.data.strip('\t \n') == '' :
                                toDel.append(n)
                        elif del_comments and n.nodeType == Node.COMMENT_NODE :
                                toDel.append(n)
                        elif del_pi and n.nodeType == Node.PROCESSING_INSTRUCTION_NODE :
                                toDel.append(n)
                        elif n.nodeType == Node.ELEMENT_NODE :
                                self.__purgeDOM(n, del_spaces, del_comments, del_pi)
                for n in toDel :
                        no.removeChild(n)
                return no


        def __getDTD(self, file) :
                # content of the DTD file, or None when the file does not exist
                if not existFile(file) :
                        return None
                with open(file,'r') as f :
                        return f.read()


        def __extractDTD(self, file) :
                # {element name: {attribute name: (type, status or default value)}} read from the DTD file
                dtd = self.__getDTD(file)
                if dtd is None :
                        return dict()
                return self.__parseDTD(dtd)


        def __parseDTD(self, dtd) :
                # same structure as __extractDTD, built from the DTD text itself
                el = re.compile(r'<!ELEMENT (?P<elementname>[\w\-\:\_]+) (?P<description>.*)\s*>')
                att = re.compile(r'<!ATTLIST (?P<elementname>[\w\-\:\_]+) (?P<attributs>.*)\s*>')
                att2 = re.compile(r'(?P<attname>[\w\-\:\_]+) (?P<def>.*?) (?P<status>#[\w\-\:\_]+|[\"\'].*?[\"\'])')
                comment = re.compile(r'<!--.*?-->', re.DOTALL)

                # comments are dropped first so that declarations they contain are not registered
                dtd = comment.sub(' ', dtd).replace('\n',' ').replace('\t',' ')
                liste_elem = dict()
                for item in re.findall(r'<.*?>', dtd) :
                        grp = el.match(item)
                        if grp is not None :
                                # setdefault keeps attributes already registered by an earlier ATTLIST
                                liste_elem.setdefault(grp.group('elementname').strip('\t \n'), dict())
                                continue
                        grp = att.match(item)
                        if grp is None :
                                continue
                        # an ATTLIST may come before (or without) the ELEMENT declaration
                        attributs = liste_elem.setdefault(grp.group('elementname'), dict())
                        for (nom, definition, status) in att2.findall(grp.group('attributs')) :
                                nomAtt = nom.strip('\t \n')
                                definition = definition.strip('\t \n')
                                status = status.strip('\t \n')
                                attributs[nomAtt] = (definition, status.replace("'",'').replace('"',''))
                return liste_elem


        def __enrichNode(self, node, le) :
                # add the default attributes of `node` and of its descendants, and index their IDs
                if node.nodeType != Node.ELEMENT_NODE :
                        return
                # an element that is not declared in the DTD has nothing to add
                for (att, (definition, status)) in le.get(node.tagName, {}).items() :
                        if node.hasAttribute(att) :
                                if definition == 'ID' :
                                        self.lid[node.getAttribute(att)] = node
                        elif '#' not in status :
                                # a status without '#' is a default value
                                node.setAttribute(att,status)
                for n in node.childNodes :
                        self.__enrichNode(n,le)

Methods

def getElementById(self, id)

to retrieve an element by its ID

Parameters

id : str
the ID of the element to find

Returns

Node.ELEMENT_NODE or None
the element or None
Expand source code
def getElementById(self, id) :
        """
                Look up an element through its ID value.

                Parameters
                ----------
                id : str
                        ID value of the wanted element

                Returns
                -------
                Node.ELEMENT_NODE or None
                        the element stored in `self.lid` under that ID, or None when there is none
        """
        # dict.get already answers None for an unknown key
        return self.lid.get(id)
def getElementsByTagName(self, name)

the DOM getElementsByTagName

Parameters

name : str
the Element to find in the DOM

Returns

NodeList or None
a list of elements or None
Expand source code
def getElementsByTagName(self, name) :
        """
                Delegate to the DOM getElementsByTagName of the wrapped document.

                Parameters
                ----------
                name : str
                        tag name of the elements to collect

                Returns
                -------
                NodeList or None
                        the elements found, or None when `self.doc` is None
        """
        document = self.doc
        if document is None :
                return None
        return document.getElementsByTagName(name)
def parse(self, file, validate=False)

to load an XML file

Parameters

file : str
file that contains the XML file to load
validate : boolean, optional
flag to validate the XML file if it contains a Doctype section

See Also

DOMCompanion.validate()

Notes

If a DTD is specified, it is used to add default attributes and to collect IDs. See https://docs.python.org/3/library/xml.dom.minidom.html

Expand source code
def parse(self, file, validate = False):
        """
                Load an XML file into this companion.

                Nothing happens when `file` does not exist.

                Parameters
                ----------
                file : str
                        path of the XML file to load

                validate : boolean, optional
                        when True, `validate` is called on the loaded document;
                        otherwise only the enrichment step is run

                See Also
                --------
                `DOMCompanion.validate`

                Notes
                -----
                if a DTD is specified, uses it to add default attributes and to collect IDs
                See https://docs.python.org/3/library/xml.dom.minidom.html
        """
        if not existFile(file) :
                return
        loaded = parse(file)
        self.doc = loaded
        self.documentElement = loaded.documentElement
        if not validate :
                self.__enrichXML()
        else :
                self.validate()
def parseString(self, xml, validate=False)

to load an XML string

Parameters

xml : str
the string that contains the XML
validate : boolean, optional
flag to validate the XML file if it contains a Doctype section

See Also

DOMCompanion.validate()

Notes

If a DTD is specified, it is used to add default attributes and to collect IDs. See https://docs.python.org/3/library/xml.dom.minidom.html

Expand source code
def parseString(self, xml, validate = False):
        """
                Load an XML document from a string into this companion.

                The string is parsed with ``xml.dom.minidom.parseString``;
                ``self.doc`` and ``self.documentElement`` are then refreshed
                from the result.

                Parameters
                ----------
                xml : str
                        the string that contains the XML

                validate : boolean, optional
                        when True, ``self.validate()`` is called after loading;
                        otherwise only the enrichment step (``__enrichXML``) is run

                See Also
                --------
                `DOMCompanion.validate`

                Notes
                -----
                If a DTD is specified, it is used to add default attributes and to collect IDs.
                See https://docs.python.org/3/library/xml.dom.minidom.html
        """
        loaded = parseString(xml)
        self.doc = loaded
        self.documentElement = loaded.documentElement
        if not validate :
                self.__enrichXML()
        else :
                self.validate()
def toLighter(self, del_spaces=True, del_comments=True, del_pi=True)

to suppress text nodes (with only separators), processing instructions and/or comments

Parameters

del_spaces : boolean, optional
to suppress blank nodes (containing only newline, tabulation and space characters)
del_comments : boolean, optional
to suppress comment nodes
del_pi : boolean, optional
to suppress processing instruction nodes

Returns

DOMCompanion
itself
Expand source code
def toLighter(self, del_spaces = True, del_comments = True, del_pi = True) :
        """
                Strip blank text nodes, comments and/or processing instructions from the DOM.

                The actual pruning is delegated to the private ``__purgeDOM`` helper,
                which receives the document and the three flags. Nothing is done
                when no document is loaded.

                Parameters
                ----------
                del_spaces : boolean, optional
                        to suppress blank nodes (containing only newline, tabulation and space characters)
                del_comments : boolean, optional
                        to suppress comment nodes
                del_pi : boolean, optional
                        to suppress processing instruction nodes

                Returns
                -------
                DOMCompanion
                        itself, so that calls can be chained
        """
        if self.doc is None :
                return self
        self.__purgeDOM(self.doc, del_spaces, del_comments, del_pi)
        return self
def toprettyxml(self, indent='\t', newl='\n', encoding=None, standalone=None)

produce pretty-printed version of the XML string

Returns

str
the XML string

Notes

See https://docs.python.org/3/library/xml.dom.minidom.html

Expand source code
def toprettyxml(self, indent="\t", newl="\n", encoding=None, standalone=None) :
        """
                produce pretty-printed version of the XML string

                Thin wrapper around ``self.doc.toprettyxml``.

                Parameters
                ----------
                indent : str, optional
                        indentation string used for each nesting level
                newl : str, optional
                        string emitted at the end of each line
                encoding : str, optional
                        forwarded to minidom as the output encoding
                standalone : boolean, optional
                        forwarded to minidom only when it is not None

                Returns
                -------
                str
                        the XML string (minidom returns bytes when an encoding is given)

                Notes
                -----
                See https://docs.python.org/3/library/xml.dom.minidom.html
        """
        if standalone is None :
                # minidom only accepts ``standalone`` since Python 3.9; omitting it
                # when unset keeps the call valid on older interpreters.
                return self.doc.toprettyxml(indent, newl, encoding)
        return self.doc.toprettyxml(indent, newl, encoding, standalone)
def toxml(self)

produce XML string

Returns

str
the XML string

Notes

See https://docs.python.org/3/library/xml.dom.minidom.html

Expand source code
def toxml(self) :
        """
                Serialize the wrapped DOM to an XML string.

                Returns
                -------
                str
                        the value returned by ``self.doc.toxml()``

                Notes
                -----
                See https://docs.python.org/3/library/xml.dom.minidom.html
        """
        document = self.doc
        return document.toxml()
def validate(self)

Validates the XML according to its DTD (and enriches it too). It uses the lxml module to validate the XML document.

Returns

boolean
the DOM is valid or not according to the specified DTD
Expand source code
def validate(self) :
        """
                to validate the XML according to its DTD (enrich it too). It uses lxml module to validate the XML document.

                The DTD location is taken from the system identifier of the
                document's DOCTYPE. On success the enrichment step
                (``__enrichXML``) is run. Problems (missing DTD file, first
                validation error) are reported with ``print``.

                Returns
                -------
                boolean
                        False when no document is loaded, when the DTD file cannot
                        be found, or when the document is invalid;
                        True when the document is valid, or when it declares no
                        DOCTYPE / no system identifier (nothing to validate against)
        """
        if self.doc is None :
                return False
        doctype = self.doc.doctype
        # minidom leaves ``doctype`` at None when the document has no DOCTYPE
        # section; reading ``.systemId`` on it would raise AttributeError.
        if doctype is None or doctype.systemId is None :
                return True
        dtdFile = doctype.systemId
        if not existFile(dtdFile) :
                print('Unable ti find the DTD file ',dtdFile)
                return False
        # Build the lxml tree only once we know there is a DTD to check against.
        parser = etree.XMLParser(recover=True, strip_cdata=True)
        tree = etree.XML(self.doc.toxml(), parser)
        dtd = etree.DTD(dtdFile)
        if dtd.validate(tree) :
                self.__enrichXML()
                return True
        # Only the first error of the log is reported.
        print(dtd.error_log.filter_from_errors()[0])
        return False