dalelane / xmldiff.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
########################################################################## |
# |
# xmldiff |
# |
# Simple utility script to enable a diff of two XML files in a way |
# that ignores the order of attributes and elements.
# |
# Dale Lane (email@dalelane.co.uk) |
# 6 Oct 2014 |
# |
########################################################################## |
# |
# Overview |
# The approach is to sort both files by attribute and element, and |
# then reuse an existing diff implementation on the sorted files. |
# |
# Arguments |
# the command that should be run to diff the sorted files |
# the first XML file to diff |
# the second XML file to diff |
# |
# Background |
# http://dalelane.co.uk/blog/?p=3225 |
# |
########################################################################## |
import os
import platform
import shlex
import subprocess
import sys
from operator import attrgetter

import lxml.etree as le
# |
# Check required arguments: the diff command plus the two XML files to compare
if len(sys.argv) != 4:
    # The placeholder names follow the "Arguments" section of the file header.
    print("Usage: python xmldiff.py <diffcommand> <filename1> <filename2>")
    # Exit with a non-zero status so callers can detect the usage error.
    sys.exit(1)
# |
# Prepares the location of the temporary file that will be created by xmldiff
def createFileObj(prefix, name):
    """Describe an input file and the temporary sorted copy made for it.

    prefix -- label placed in the temporary file name (e.g. "from" or "to")
    name   -- path of the XML file as given by the user

    Returns a dict with two keys:
      "filename"    -- the absolute path of `name`
      "tmpfilename" -- "." + prefix + "." + the base name of `name`; this is
                       a relative name, so it resolves against the current
                       working directory
    """
    return {
        "filename": os.path.abspath(name),
        "tmpfilename": "." + prefix + "." + os.path.basename(name),
    }
# |
# Function to sort XML elements by id
#  (where the elements have an 'id' attribute that can be cast to an int)
def sortbyid(elem):
    """Return the sort key derived from the 'id' attribute of elem.

    elem -- any object whose get('id') returns the attribute value, or a
            false value when the attribute is absent

    Returns int(id) when the attribute is present and parses as an integer,
    otherwise 0 (missing, empty or non-numeric id).
    """
    idvalue = elem.get('id')
    if not idvalue:
        return 0
    try:
        return int(idvalue)
    except ValueError:
        # Non-numeric ids all share the same key, so a stable sort leaves
        # their relative order untouched.
        return 0
# |
# Function to sort XML elements by their text contents
def sortbytext(elem):
    """Return the sort key derived from the text of elem.

    elem -- any object with a `text` attribute

    Returns elem.text, or the empty string when the text is None or empty,
    so that the key is always a string.
    """
    return elem.text or ''
# |
# Function to sort XML attributes alphabetically by key
#  The original item is left unmodified, and its attributes are
#  copied to the provided sorteditem
def sortAttrs(item, sorteditem):
    """Copy the attributes of item onto sorteditem in sorted key order.

    item       -- source; read through keys() and get(key) only
    sorteditem -- destination; receives one set(key, value) call per
                  attribute, issued in ascending key order
    """
    for key in sorted(item.keys()):
        sorteditem.set(key, item.get(key))
# |
# Function to sort XML elements
#  The sorted elements will be added as children of the provided newroot
#  This is a recursive function, and will be called on each of the children
#  of items.
def sortElements(items, newroot):
    """Append sorted copies of items (and their descendants) to newroot.

    items   -- iterable of sibling XML nodes; it is not modified
    newroot -- element that receives the sorted copies as children

    The intended sort order is to sort by XML element name.
    If more than one element has the same name, they are sorted by their
    text contents.
    If more than one element has the same name and text contents, they are
    sorted by the integer value of their 'id' attribute.
    Elements that still tie keep their original relative order.

    Each copy carries the tag, the attributes (sorted by key) and the text
    of the original, except that whitespace-only text is dropped. Tail text
    is not copied.
    """
    # lxml reports comments, processing instructions and entities with a
    # non-string tag. Those cannot be ordered against element names or passed
    # to le.Element(), so only real elements are carried over.
    elements = [item for item in items if isinstance(item.tag, str)]

    # One sort on a (name, text, id) key is equivalent to three successive
    # stable sorts by id, then text, then name.
    elements.sort(key=lambda item: (item.tag, sortbytext(item), sortbyid(item)))

    for item in elements:
        # Create a new item to represent the sorted version of this item,
        # and copy the tag name and (non-blank) text contents
        newitem = le.Element(item.tag)
        if item.text and not item.text.isspace():
            newitem.text = item.text

        # Copy the attributes (sorted by key) to the new item
        sortAttrs(item, newitem)

        # Copy the children of item (sorted) to the new item
        sortElements(list(item), newitem)

        # Append this sorted item to the sorted root
        newroot.append(newitem)
# |
# Function to sort the provided XML file
#  fileobj['filename'] will be left untouched
#  A new sorted copy of it will be created at fileobj['tmpfilename']
def sortFile(fileobj):
    """Write a sorted copy of an XML file to its temporary location.

    fileobj -- dict with a "filename" key (the XML file to read) and a
               "tmpfilename" key (where the sorted copy is written), as
               returned by createFileObj

    The copy has the same root tag as the original; the root's attributes
    and all descendant elements are sorted. Text directly inside the root
    element is not copied.
    """
    # Read as bytes so that the XML parser, rather than Python's text layer,
    # decides how to decode the document.
    with open(fileobj['filename'], 'rb') as original:
        # parse the XML file and get a pointer to the top
        xmldoc = le.parse(original)
    xmlroot = xmldoc.getroot()

    # create a new XML element that will be the top of
    #  the sorted copy of the XML file
    newxmlroot = le.Element(xmlroot.tag)

    # create the sorted copy of the XML file
    sortAttrs(xmlroot, newxmlroot)
    sortElements(list(xmlroot), newxmlroot)

    # write the sorted XML file to the temp file
    newtree = le.ElementTree(newxmlroot)
    with open(fileobj['tmpfilename'], 'wb') as newfile:
        newtree.write(newfile, pretty_print=True)
# |
# sort each of the specified files
filefrom = createFileObj("from", sys.argv[2])
fileto = createFileObj("to", sys.argv[3])

try:
    sortFile(filefrom)
    sortFile(fileto)

    #
    # invoke the requested diff command to compare the two sorted files
    #  sys.argv[1] is handed to a shell as-is, so it may carry its own options
    if platform.system() == "Windows":
        sp = subprocess.Popen(["cmd", "/c", sys.argv[1] + " " + filefrom['tmpfilename'] + " " + fileto['tmpfilename']])
        sp.communicate()
    else:
        # Quote the file paths so that names containing spaces or shell
        # metacharacters reach the diff command as single arguments.
        command = " ".join([
            sys.argv[1],
            shlex.quote(os.path.abspath(filefrom['tmpfilename'])),
            shlex.quote(os.path.abspath(fileto['tmpfilename'])),
        ])
        sp = subprocess.Popen(["/bin/bash", "-i", "-c", command])
        sp.communicate()
finally:
    #
    # cleanup - delete the temporary sorted files after the diff terminates,
    #  including when sorting or diffing failed part-way through
    for tmpfilename in (filefrom['tmpfilename'], fileto['tmpfilename']):
        if os.path.exists(tmpfilename):
            os.remove(tmpfilename)
xml_diff 0.7.0
Compares two XML documents by diffing their text, ignoring structure, and wraps changed text in <del>/<ins> tags.
Навигация
Ссылки проекта
Статистика
Метаданные
Лицензия: CC0 (copyright waived)
Сопровождающие
Описание проекта
Compares the text inside two XML documents and marks up the differences with <ins> and <del> tags.
This is the result of about 7 years of trying to get this right and coded simply. I’ve used code like this in one form or another to compare bill text on GovTrack.us .
The comparison is completely blind to the structure of the two XML documents. It does a word-by-word comparison on the text content only, and then it goes back into the original documents and wraps changed text in new <ins> and <del> wrapper elements.
The documents are then concatenated to form a new document and the new document is printed on standard output. Or use this as a library and call compare yourself with two lxml.etree.Element nodes (the roots of your documents).
The script is written in Python 3.
Example
Comparing these two documents:
Here is some bold text.
Here is some italic content that shows how xml_diff works.
Here is some boldtext. Here is some italic content that shows how xml_diff works.
On Ubuntu, get dependencies with:
apt-get install python3-lxml libxml2-dev libxslt1-dev
For really fast comparisons, get Google’s Diff Match Patch library , as re-written and sped-up by @leutloff and then turned into a Python extension module by me :
pip3 install diff_match_patch_python
Or if you can’t install that for any reason, use the pure-Python library:
pip3 install diff-match-patch
Finally, install this module:
pip3 install xml_diff
Then call the module from the command line:
python3 -m xml_diff --tags del,ins doc1.xml doc2.xml > changes.xml
Or use the module from Python:
import lxml.etree
from xml_diff import compare

dom1 = lxml.etree.parse("doc1.xml").getroot()
dom2 = lxml.etree.parse("doc2.xml").getroot()
comparison = compare(dom1, dom2)
The two DOMs are modified in-place.
Optional Arguments
The compare function takes other optional keyword arguments:
merge is a boolean (default false) that indicates whether the comparison function should perform a merge. If true, dom1 will contain not just <del> nodes but also <ins> nodes and, similarly, dom2 will contain not just <ins> nodes but also <del> nodes. Although the two DOMs will now contain the same semantic information about changes, and the same text content, each preserves its original structure — since the comparison is only over text and not structure. The new ins / del nodes contain content from the other document (including whole subtrees), and so there’s no guarantee that the final documents will conform to any particular structural schema after this operation.
word_separator_regex (default r"\s+|[^\s\w]") is a regular expression for how to separate words. The default splits on one or more spaces in a row and single instances of non-word characters.
differ is a function that takes two arguments (text1, text2) and returns an iterator over difference operations given as tuples of the form (operation, text_length), where operation is one of "=" (no change in text), "+" (text inserted into text2), or "-" (text deleted from text1). (See xml_diff/__init__.py’s default_differ function for how the default differ works.)
tags is a two-tuple of tag names to use for deleted and inserted content. The default is ('del', 'ins').
make_tag_func is a function that takes one argument, which is either "ins" or "del", and returns a new lxml.etree.Element to be inserted into the DOM to wrap changed content. If given, the tags argument is ignored.