5.2. XML Module lxml
pip install lxml
5.2.1. Creating elements
Creating elements:
from lxml.etree import tostring, Element
root = Element("iris")
print(tostring(root))
# b'<iris/>'
Adding elements using list interface:
from lxml.etree import tostring, Element
root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))
print(tostring(root))
# b'<iris><setosa/><versicolor/><virginica/></iris>'
5.2.2. Length of a subtree
Length of a subtree:
from lxml.etree import Element
root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))
print(len(root))
# 3
5.2.3. Selecting subtree
Selecting subtree:
from lxml.etree import Element
root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))
selected = root[2]
print(selected.tag)
# virginica
Finding the index of a selected element:
from lxml.etree import Element
root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))
selected = root[1]
root.index(selected)
# 1
selected = root[2]
root.index(selected)
# 2
5.2.4. Element tree as a list
Elements are lists:
from lxml.etree import tostring, Element
root = Element('iris')
root.append(Element('setosa'))
root.append(Element('versicolor'))
root.append(Element('virginica'))
children = list(root)
print(children)
# [
# <Element setosa at 0x113cd4048>,
# <Element versicolor at 0x113cd4188>,
# <Element virginica at 0x113cd41c8>
# ]
Iterating over elements:
from lxml.etree import Element
root = Element("iris")
root.append(Element("setosa"))
root.append(Element("versicolor"))
root.append(Element("virginica"))
for child in root:
    print(child.tag)
# setosa
# versicolor
# virginica
Slicing elements:
from lxml.etree import Element
root = Element("iris")
root.append(Element("setosa"))
root.append(Element("versicolor"))
root.append(Element("virginica"))
root.insert(0, Element("arctica"))
start = root[:1]
end = root[-1:]
print(start[0].tag) # arctica
print(end[0].tag) # virginica
5.2.5. Elements as a dict
Create element using dict interface:
from lxml.etree import tostring, Element
tag = Element("iris", kingdom="plantae")
print(tostring(tag))
# b'<iris kingdom="plantae"/>'
Get element attributes and values:
from lxml.etree import tostring, Element
tag = Element("iris", kingdom="plantae")
print(tag.get("kingdom")) # plantae
print(tag.get("not-existing")) # None
Set element attributes and values:
from lxml.etree import tostring, Element
tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")
print(tag.get("kind"))
# flower
print(tostring(tag))
# b'<iris kingdom="plantae" kind="flower"/>'
Elements carry attributes as a dict:
from lxml.etree import Element
tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")
tag.keys()
# ['kingdom', 'kind']
tag.values()
# ['plantae', 'flower']
tag.items()
# [('kingdom', 'plantae'), ('kind', 'flower')]
Iterating over element attributes and values:
from lxml.etree import Element
tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")
for key, value in tag.items():
    print(f'{key} -> {value}')
# kingdom -> plantae
# kind -> flower
Attributes are also available through the dict-like attrib property:
from lxml.etree import Element
tag = Element("iris", kingdom="plantae")
tag.set("kind", "flower")
tag.attrib['kingdom']
# 'plantae'
tag.attrib['not-existing']
# Traceback (most recent call last):
# KeyError: 'not-existing'
tag.attrib['species'] = 'Setosa'
tag.attrib.get('species')
# 'Setosa'
tag.attrib
# {'kingdom': 'plantae', 'kind': 'flower', 'species': 'Setosa'}
tag.attrib.items()
# [('kingdom', 'plantae'), ('kind', 'flower'), ('species', 'Setosa')]
5.2.6. Elements contain text
from lxml.etree import tostring, Element
tag = Element("iris")
tag.text = "Setosa"
tag.text
# 'Setosa'
tostring(tag)
# b'<iris>Setosa</iris>'
5.2.7. Tree iteration
from lxml.etree import tostring, Element, SubElement
root = Element("iris")
SubElement(root, "species").text = "Setosa"
SubElement(root, "species").text = "Virginica"
SubElement(root, "flower").text = "Versicolor"
print(tostring(root, pretty_print=True))
# b'<iris>\n  <species>Setosa</species>\n  <species>Virginica</species>\n  <flower>Versicolor</flower>\n</iris>\n'
for element in root.iter():
    print(f'{element.tag} -> {element.text}')
# iris -> None
# species -> Setosa
# species -> Virginica
# flower -> Versicolor
for element in root.iter("species"):
    print(f'{element.tag} -> {element.text}')
# species -> Setosa
# species -> Virginica
for element in root.iter("species", "flower"):
    print(f'{element.tag} -> {element.text}')
# species -> Setosa
# species -> Virginica
# flower -> Versicolor
5.2.8. Entities
from lxml.etree import tostring, Element, SubElement, Entity
root = Element("iris")
print(tostring(root))
# b'<iris/>'
root.append(Entity("#234"))
print(tostring(root))
# b'<iris>&#234;</iris>'
5.2.10. Serialization
from lxml.etree import tostring, XML
root = XML('<root><a><b/></a></root>')
tostring(root)
# b'<root><a><b/></a></root>'
print(tostring(root, xml_declaration=True))
# b"<?xml version='1.0' encoding='ASCII'?>\n<root><a><b/></a></root>"
print(tostring(root, encoding='utf-8'))
# b'<root><a><b/></a></root>'
print(tostring(root, encoding='iso-8859-2'))
# b"<?xml version='1.0' encoding='iso-8859-2'?>\n<root><a><b/></a></root>"
print(tostring(root, pretty_print=True))
# b'<root>\n  <a>\n    <b/>\n  </a>\n</root>\n'
print(tostring(root, pretty_print=True).decode())
# <root>
#   <a>
#     <b/>
#   </a>
# </root>
from lxml.etree import tostring, XML
root = XML('<html><head/><body><p>Hello<br/>World</p></body></html>')
# default: method = 'xml'
tostring(root)
# b'<html><head/><body><p>Hello<br/>World</p></body></html>'
tostring(root, method='xml')
# b'<html><head/><body><p>Hello<br/>World</p></body></html>'
tostring(root, method='html')
# b'<html><head></head><body><p>Hello<br>World</p></body></html>'
print(tostring(root, method='html', pretty_print=True))
# b'<html>\n<head></head>\n<body><p>Hello<br>World</p></body>\n</html>\n'
print(tostring(root, method='html', pretty_print=True).decode())
# <html>
# <head></head>
# <body><p>Hello<br>World</p></body>
# </html>
tostring(root, method='text')
# b'HelloWorld'
5.2.9. Comments