Parsers I've tested:
- HtmlAgilityPack
- SgmlReader + XmlDocument
- SgmlReader + XElement
- SgmlReader + XPathDocument
- SgmlReader + VTD-XML
Test scenario
I selected an average web page for the test, one that is almost XHTML 1.0 compliant. Its size is 118 kB. It was downloaded using HttpWebRequest, and the response stream was passed to the testing functions. Before every parse I converted the encoding. I converted from UTF-8 to UTF-8, so it might make no sense here, but in a real scenario I will often convert from other encodings, so the cost should be included in the benchmark. After every document load I evaluated three XPath queries, as my real-life applications do. When using SgmlReader I had to remove the xmlns declarations, because my XPath queries are written in XPath 1.0 and don't include namespaces. If you can write your XPath queries using namespaces, you can skip that step and pass the SgmlReader directly to the parser as input - it might improve performance a little.
HtmlAgilityPack test code
// HtmlAgilityPack benchmark: for each of `total` iterations, re-encode the
// downloaded page, parse it with HtmlDocument and evaluate the three XPath queries.
for (int i = 0; i < total; i++)
{
    // The encoding conversion is deliberately kept inside the loop so that its
    // cost is included in the measurement.
    byte[] outputBytes = Encoding.Convert(Encoding.GetEncoding("UTF-8"), Encoding.UTF8, inputms.ToArray());
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(Encoding.UTF8.GetString(outputBytes));

    // Output is only produced every 100th iteration to keep console I/O out of the timing.
    bool report = i % 100 == 0;

    foreach (string query in new string[] { xpath, xpath2, xpath3 })
    {
        // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
        // when nothing matches, so iterating the result directly would throw.
        HtmlNodeCollection matches = doc.DocumentNode.SelectNodes(query);
        if (matches == null)
        {
            continue;
        }

        foreach (HtmlNode el in matches)
        {
            if (report) Console.WriteLine(i + ": " + el.InnerText);
        }
    }

    if (report)
    {
        Console.WriteLine("Memory usage:" + GC.GetTotalMemory(true) / 1024 + " kb");
    }
}
SgmlReader + XmlDocument code
// SgmlReader + XmlDocument benchmark: for each of `total` iterations, re-encode
// the page, convert it to XML text with SgmlReader, strip the xmlns declarations,
// load the result into an XmlDocument and evaluate the three XPath queries.
for (int i = 0; i < total; i++)
{
    // The encoding conversion is deliberately kept inside the loop so that its
    // cost is included in the measurement.
    byte[] outputBytes = Encoding.Convert(Encoding.GetEncoding("UTF-8"), Encoding.UTF8, inputms.ToArray());

    string htmlCleaned;
    using (MemoryStream outputms = new MemoryStream())
    using (StreamWriter sw2 = new StreamWriter(outputms))
    {
        sw2.Write(Encoding.UTF8.GetString(outputBytes));
        sw2.Flush();
        outputms.Seek(0, SeekOrigin.Begin);

        using (StreamReader htmlReader = new StreamReader(outputms))
        {
            // setup SgmlReader
            Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
            sgmlReader.DocType = "HTML";
            sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
            sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
            sgmlReader.IgnoreDtd = true;
            sgmlReader.InputStream = htmlReader;

            // Remove namespace declarations so namespace-less XPath 1.0 queries match.
            // The value is matched with [^"]* rather than a greedy .+ so that only
            // the xmlns attribute itself is removed, not every attribute that
            // follows it on the same line.
            string strXMLPattern = @"xmlns(:\w+)?=""[^""]*""";
            htmlCleaned = Regex.Replace(sgmlReader.ReadOuterXml(), strXMLPattern, "");
        }
    }

    XmlDocument doc = new XmlDocument();
    doc.LoadXml(htmlCleaned);

    // Output is only produced every 100th iteration to keep console I/O out of the timing.
    bool report = i % 100 == 0;

    foreach (string query in new string[] { xpath, xpath2, xpath3 })
    {
        foreach (XmlNode el in doc.DocumentElement.SelectNodes(query))
        {
            // XmlNode.Value is null for element nodes; InnerText yields the text of
            // elements and is equal to Value for text and attribute nodes.
            if (report) Console.WriteLine(i + ": " + el.InnerText);
        }
    }

    if (report)
    {
        Console.WriteLine("Memory usage:" + GC.GetTotalMemory(true) / 1024 + " kb");
    }
}
SgmlReader + XElement code
// SgmlReader + XElement benchmark: for each of `total` iterations, re-encode the
// page, convert it to XML text with SgmlReader, strip the xmlns declarations,
// load the result into an XElement and evaluate the three XPath queries.
for (int i = 0; i < total; i++)
{
    // The encoding conversion is deliberately kept inside the loop so that its
    // cost is included in the measurement.
    byte[] outputBytes = Encoding.Convert(Encoding.GetEncoding("UTF-8"), Encoding.UTF8, inputms.ToArray());

    string htmlCleaned;
    using (MemoryStream outputms = new MemoryStream())
    using (StreamWriter sw2 = new StreamWriter(outputms))
    {
        sw2.Write(Encoding.UTF8.GetString(outputBytes));
        sw2.Flush();
        outputms.Seek(0, SeekOrigin.Begin);

        using (StreamReader htmlReader = new StreamReader(outputms))
        {
            // setup SgmlReader
            Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
            sgmlReader.DocType = "HTML";
            sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
            sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
            sgmlReader.IgnoreDtd = true;
            sgmlReader.InputStream = htmlReader;

            // Remove namespace declarations so namespace-less XPath 1.0 queries match.
            // The value is matched with [^"]* rather than a greedy .+ so that only
            // the xmlns attribute itself is removed, not every attribute that
            // follows it on the same line.
            string strXMLPattern = @"xmlns(:\w+)?=""[^""]*""";
            htmlCleaned = Regex.Replace(sgmlReader.ReadOuterXml(), strXMLPattern, "");
        }
    }

    using (StringReader sr3 = new StringReader(htmlCleaned))
    {
        XElement ele = XElement.Load(sr3);

        // Output is only produced every 100th iteration to keep console I/O out of the timing.
        bool report = i % 100 == 0;

        foreach (string query in new string[] { xpath, xpath2, xpath3 })
        {
            foreach (XElement el in ele.XPathSelectElements(query))
            {
                if (report) Console.WriteLine(i + ": " + el.Value);
            }
        }

        if (report)
        {
            Console.WriteLine("Memory usage:" + GC.GetTotalMemory(true) / 1024 + " kb");
        }
    }
}
SgmlReader + XPathDocument code
// SgmlReader + XPathDocument benchmark: for each of `total` iterations, re-encode
// the page, convert it to XML text with SgmlReader, strip the xmlns declarations,
// load the result into an XPathDocument and evaluate the three XPath queries.
for (int i = 0; i < total; i++)
{
    // The encoding conversion is deliberately kept inside the loop so that its
    // cost is included in the measurement.
    byte[] outputBytes = Encoding.Convert(Encoding.GetEncoding("UTF-8"), Encoding.UTF8, inputms.ToArray());

    string htmlCleaned;
    using (MemoryStream outputms = new MemoryStream())
    using (StreamWriter sw2 = new StreamWriter(outputms))
    {
        sw2.Write(Encoding.UTF8.GetString(outputBytes));
        sw2.Flush();
        outputms.Seek(0, SeekOrigin.Begin);

        using (StreamReader htmlReader = new StreamReader(outputms))
        {
            // setup SgmlReader
            Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
            sgmlReader.DocType = "HTML";
            sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
            sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
            sgmlReader.IgnoreDtd = true;
            sgmlReader.InputStream = htmlReader;

            // Remove namespace declarations so namespace-less XPath 1.0 queries match.
            // The value is matched with [^"]* rather than a greedy .+ so that only
            // the xmlns attribute itself is removed, not every attribute that
            // follows it on the same line.
            string strXMLPattern = @"xmlns(:\w+)?=""[^""]*""";
            htmlCleaned = Regex.Replace(sgmlReader.ReadOuterXml(), strXMLPattern, "");
        }
    }

    using (StringReader sr3 = new StringReader(htmlCleaned))
    {
        XPathDocument doc = new XPathDocument(sr3);
        XPathNavigator nav = doc.CreateNavigator();

        // Output is only produced every 100th iteration to keep console I/O out of the timing.
        bool report = i % 100 == 0;

        foreach (string query in new string[] { xpath, xpath2, xpath3 })
        {
            foreach (XPathNavigator el in nav.Select(query))
            {
                if (report) Console.WriteLine(i + ": " + el.Value);
            }
        }

        if (report)
        {
            Console.WriteLine("Memory usage:" + GC.GetTotalMemory(true) / 1024 + " kb");
        }
    }
}
SgmlReader + VTD-XML code
// SgmlReader + VTD-XML benchmark. The VTDGen and the three compiled XPath
// expressions (AutoPilot instances) are created once and reused for all iterations.
VTDGen vg = new VTDGen();
AutoPilot ap = new AutoPilot();
ap.selectXPath(xpath);
AutoPilot ap2 = new AutoPilot();
ap2.selectXPath(xpath2);
AutoPilot ap3 = new AutoPilot();
ap3.selectXPath(xpath3);
AutoPilot[] pilots = new AutoPilot[] { ap, ap2, ap3 };

for (int i = 0; i < total; i++)
{
    // The encoding conversion is deliberately kept inside the loop so that its
    // cost is included in the measurement.
    byte[] outputBytes = Encoding.Convert(Encoding.GetEncoding("UTF-8"), Encoding.UTF8, inputms.ToArray());

    using (MemoryStream outputms = new MemoryStream())
    using (StreamWriter sw2 = new StreamWriter(outputms))
    {
        sw2.Write(Encoding.UTF8.GetString(outputBytes));
        sw2.Flush();
        outputms.Seek(0, SeekOrigin.Begin);

        using (StreamReader htmlReader = new StreamReader(outputms))
        {
            // setup SgmlReader
            Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
            sgmlReader.DocType = "HTML";
            sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
            sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
            sgmlReader.IgnoreDtd = true;
            sgmlReader.InputStream = htmlReader;

            vg.setDoc_BR(Encoding.UTF8.GetBytes(sgmlReader.ReadOuterXml()));
        }
    }

    vg.parse(false);
    VTDNav vn = vg.getNav();

    // Output is only produced every 100th iteration to keep console I/O out of the timing.
    bool report = i % 100 == 0;

    foreach (AutoPilot pilot in pilots)
    {
        pilot.bind(vn);
        while (pilot.evalXPath() != -1)
        {
            if (report)
            {
                // getText() yields a negative index when the matched node has no
                // text; guard every query the same way so toString() is never
                // called with an invalid index.
                int ie = vn.getText();
                if (ie > -1)
                {
                    Console.WriteLine(i + ": " + vn.toString(ie));
                }
            }
        }
        // Reset so the same compiled expression can be evaluated on the next document.
        pilot.resetXPath();
    }

    if (report)
    {
        Console.WriteLine("Memory usage:" + GC.GetTotalMemory(true) / 1024 + " kb");
    }
}
Performance results
The test was run on an Intel C2D E8400 3GHz CPU with 6MB cache and 4GB of RAM, in a single-threaded application. It was important for me to get results in an environment similar to small instances of the Azure cloud. I tested parsing the same document 500 times, one after another, as you can see in the code.
- HtmlAgilityPack: 31,25 parses / second, memory usage: 1754kb
- SgmlReader + XmlDocument: 38,64 parses / second, memory usage: 1484kb
- SgmlReader + XElement : 55,56 parses / second, memory usage: 1422kb
- SgmlReader + XPathDocument: 62,50 parses / second, memory usage: 1415kb
- SgmlReader + VTD-XML: 27,78 parses / second, memory usage: 1484kb
The winner is: XPathDocument
As Microsoft states, it "provides a fast, read-only, in-memory representation of an XML document by using the XPath data model" and it seems to be true :) It is almost twice as fast as the most popular HTML parsing class, HtmlAgilityPack.
A big disappointment is VTD-XML. The results are poor even though the code was written following the official recommendations. I found some opinions on the web that the C# port might be slower.