Using XMake, run xmake install
on the repository to install the library
(Use xmake f --boost=n
to disable installing the Boost Library beforehand)
Include html-parser/HTMLDocument.h
.
The interface to parse HTML string and get data from it.
Construct a HTMLDocument
object from a std::istream
or string.
using namespace html_parser;
// explicit HTMLDocument::HTMLDocument(std::istream &)
HTMLDocument document1(std::cin);
// explicit HTMLDocument::HTMLDocument(std::istream &&)
HTMLDocument document2(std::ifstream("index.html"));
// explicit HTMLDocument::HTMLDocument(const std::string &)
HTMLDocument document3("<div>a ≤ b</div>");
Parse HTML document from a new string, replacing the current if exists.
using namespace html_parser;
HTMLDocument document(std::cin);
// void HTMLDocument::parse(const std::string &)
document.parse("<div>a ≤ b</div>");
Print the colorized DOM tree of HTML document to the terminal.
using namespace html_parser;
HTMLDocument document("<div>a ≤ b</div>");
// void HTMLDocument::inspect()
document.inspect();
Get all text in the document.
using namespace html_parser;
HTMLDocument document("<div>a ≤ b</div><div>qwq</div>");
// std::string HTMLDocument::getTextContent()
std::string textContent = document.getTextContent();
// textContent = "a ≤ bqwq"
Get all the direct text in the document.
using namespace html_parser;
HTMLDocument document("<span class='myspan'> Don't want this text </span>I want this text");
// std::string HTMLDocument::getDirectTextContent()
std::string directTextContent = document.getDirectTextContent();
// directTextContent = "I want this text"
Get the element whose id
attribute equals to a string. Return a HTMLDocument::Element
object if found, a null HTMLDocument::Element
object if NOT found.
using namespace html_parser;
HTMLDocument document("<div id='my-div'>a ≤ b</div>");
// HTMLDocument::Element HTMLDocument::getElementById(const std::string &)
HTMLDocument::Element div = document.getElementById("my-div");
Get all elements whose name
attribute equal to a string. Return a std::vector<HTMLDocument::Element>
that contains all matching elements.
using namespace html_parser;
HTMLDocument document("<div name='my'>a ≤ b</div><span name='my'>qwq</span>");
// std::vector<HTMLDocument::Element> HTMLDocument::getElementsByName(const std::string &)
std::vector<HTMLDocument::Element> elements = document.getElementsByName("my");
Get all elements whose tag name equals to a string. Return a std::vector<HTMLDocument::Element>
that contains all matching elements.
using namespace html_parser;
HTMLDocument document("<div>a ≤ b</div><div>qwq</div>");
// std::vector<HTMLDocument::Element> HTMLDocument::getElementsByTagName(const std::string &)
std::vector<HTMLDocument::Element> elements = document.getElementsByTagName("div");
Get all elements which have a certain class. Return a std::vector<HTMLDocument::Element>
that contains all matching elements.
using namespace html_parser;
HTMLDocument document("<div class='my-class'>a ≤ b</div><div class='my-class'>qwq</div>");
// std::vector<HTMLDocument::Element> HTMLDocument::getElementsByClassName(const std::string &)
std::vector<HTMLDocument::Element> elements = document.getElementsByClassName("my-class");
Get all child elements of the element upon which it was called. Return a std::vector<HTMLDocument::Element>
that contains all the child elements.
using namespace html_parser;
HTMLDocument document("<span>First</span><span>Second</span>");
// std::vector<HTMLDocument::Element> HTMLDocument::getChildren()
std::vector<HTMLDocument::Element> elements = document.getChildren();
The interface to get data from a HTML element or its subtree.
The default constructor constructs a empty element, on which you do any operation will result in a std::invalid_argument
exception. Check it with if (element)
first.
Print the colorized DOM tree of this element to the terminal.
using namespace html_parser;
HTMLDocument document("<div id='wrapper'><div>a ≤ b</div></div>");
HTMLDocument::Element element = document.getElementById("wrapper");
// void HTMLDocument::Element::inspect()
element.inspect();
Get all text in the element.
using namespace html_parser;
HTMLDocument document("<div id='wrapper'><div>a ≤ b</div><div>qwq</div></div>");
HTMLDocument::Element element = document.getElementById("wrapper");
// std::string HTMLDocument::Element::getTextContent()
std::string textContent = element.getTextContent();
// textContent = "a ≤ b"
Get all the direct text in the element.
using namespace html_parser;
HTMLDocument document("<div id='wrapper'><span class='myspan'> Don't want this text </span>I want this text</div>");
HTMLDocument::Element element = document.getElementById("wrapper");
// std::string HTMLDocument::getDirectTextContent()
std::string directTextContent = element.getDirectTextContent();
// directTextContent = "I want this text"
Get a attribute with specfied name of the element. Return a empty string if not found.
using namespace html_parser;
HTMLDocument document("<div id='wrapper' data-url='/qwq'></div>");
HTMLDocument::Element element = document.getElementById("wrapper");
// std::string HTMLDocument::Element::getAttribute(const std::string &)
std::string value = element.getTextContent("data-url");
// value = "/qwq"
Get all elements whose tag name equals to a string. Return a std::vector<HTMLDocument::Element>
that contains all matching elements.
using namespace html_parser;
HTMLDocument document("<div id='wrapper'><div>a ≤ b</div><div>qwq</div></div>");
HTMLDocument::Element element = document.getElementById("wrapper");
// std::vector<HTMLDocument::Element> HTMLDocument::Element::getElementsByTagName(const std::string &)
std::vector<HTMLDocument::Element> elements = element.getElementsByTagName("div");
Get all elements which have a certain class. Return a std::vector<HTMLDocument::Element>
that contains all matching elements.
using namespace html_parser;
HTMLDocument document("<div id='wrapper'><div class='my-class'>a ≤ b</div><div class='my-class'>qwq</div></div>");
HTMLDocument::Element element = document.getElementById("wrapper");
// std::vector<HTMLDocument::Element> HTMLDocument::Element::getElementsByClassName(const std::string &)
std::vector<HTMLDocument::Element> elements = element.getElementsByClassName("my-class");
Get all child elements of the element upon which it was called. Return a std::vector<HTMLDocument::Element>
that contains all the child elements.
using namespace html_parser;
HTMLDocument document("<div id='wrapper'><span>First</span><span>Second</span></div>");
HTMLDocument::Element element = document.getElementById("wrapper");
// std::vector<HTMLDocument::Element> HTMLDocument::Element::getChildren()
std::vector<HTMLDocument::Element> elements = element.getChildren();