This module implements an HTML 4.0 non-verifying parser with an API compatible with that of the XML parser. It should be able to parse "real world" HTML, even if severely broken from a specification point of view. Table of Contents
#define htmlDefaultSubelement #define htmlElementAllowedHereDesc #define htmlRequiredAttrs Typedef xmlDocPtr htmlDocPtr
Structure htmlElemDesc struct _htmlElemDesc
Typedef htmlElemDesc * htmlElemDescPtr
Structure htmlEntityDesc struct _htmlEntityDesc
Typedef htmlEntityDesc * htmlEntityDescPtr
Typedef xmlNodePtr htmlNodePtr
Typedef xmlParserCtxt htmlParserCtxt
Typedef xmlParserCtxtPtr htmlParserCtxtPtr
Typedef xmlParserInput htmlParserInput
Typedef xmlParserInputPtr htmlParserInputPtr
Typedef xmlParserNodeInfo htmlParserNodeInfo
Enum htmlParserOption
Typedef xmlSAXHandler htmlSAXHandler
Typedef xmlSAXHandlerPtr htmlSAXHandlerPtr
Enum htmlStatus
int UTF8ToHtml (unsigned char * out, int * outlen, const unsigned char * in, int * inlen)
htmlStatus htmlAttrAllowed (const htmlElemDesc * elt, const xmlChar * attr, int legacy)
int htmlAutoCloseTag (htmlDocPtr doc, const xmlChar * name, htmlNodePtr elem)
htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer, int size)
htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax, void * user_data, const char * chunk, int size, const char * filename, xmlCharEncoding enc)
htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt, const xmlChar * cur, const char * URL, const char * encoding, int options)
htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt, int fd, const char * URL, const char * encoding, int options)
htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt, const char * filename, const char * encoding, int options)
htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void * ioctx, const char * URL, const char * encoding, int options)
htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt, const char * buffer, int size, const char * URL, const char * encoding, int options)
void htmlCtxtReset (htmlParserCtxtPtr ctxt)
int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, int options)
int htmlElementAllowedHere (const htmlElemDesc * parent, const xmlChar * elt)
htmlStatus htmlElementStatusHere (const htmlElemDesc * parent, const htmlElemDesc * elt)
int htmlEncodeEntities (unsigned char * out, int * outlen, const unsigned char * in, int * inlen, int quoteChar)
const htmlEntityDesc * htmlEntityLookup (const xmlChar * name)
const htmlEntityDesc * htmlEntityValueLookup (unsigned int value)
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt)
int htmlHandleOmittedElem (int val)
int htmlIsAutoClosed (htmlDocPtr doc, htmlNodePtr elem)
int htmlIsScriptAttribute (const xmlChar * name)
htmlParserCtxtPtr htmlNewParserCtxt (void)
htmlStatus htmlNodeStatus (const htmlNodePtr node, int legacy)
int htmlParseCharRef (htmlParserCtxtPtr ctxt)
int htmlParseChunk (htmlParserCtxtPtr ctxt, const char * chunk, int size, int terminate)
htmlDocPtr htmlParseDoc (xmlChar * cur, const char * encoding)
int htmlParseDocument (htmlParserCtxtPtr ctxt)
void htmlParseElement (htmlParserCtxtPtr ctxt)
const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt, const xmlChar ** str)
htmlDocPtr htmlParseFile (const char * filename, const char * encoding)
htmlDocPtr htmlReadDoc (const xmlChar * cur, const char * URL, const char * encoding, int options)
htmlDocPtr htmlReadFd (int fd, const char * URL, const char * encoding, int options)
htmlDocPtr htmlReadFile (const char * filename, const char * encoding, int options)
htmlDocPtr htmlReadIO (xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void * ioctx, const char * URL, const char * encoding, int options)
htmlDocPtr htmlReadMemory (const char * buffer, int size, const char * URL, const char * encoding, int options)
htmlDocPtr htmlSAXParseDoc (xmlChar * cur, const char * encoding, htmlSAXHandlerPtr sax, void * userData)
htmlDocPtr htmlSAXParseFile (const char * filename, const char * encoding, htmlSAXHandlerPtr sax, void * userData)
const htmlElemDesc * htmlTagLookup (const xmlChar * tag)
Description
Macro: htmlDefaultSubelement#define htmlDefaultSubelement Returns the default subelement for this element
Macro: htmlElementAllowedHereDesc#define htmlElementAllowedHereDesc Checks whether an HTML element description may be a direct child of the specified element. Returns 1 if allowed; 0 otherwise.
Macro: htmlRequiredAttrs#define htmlRequiredAttrs Returns the attributes required for the specified element.
Structure htmlElemDesc struct _htmlElemDesc {
const char * name : The tag name
char startTag : Whether the start tag can be implied
char endTag : Whether the end tag can be implied
char saveEndTag : Whether the end tag should be saved
char empty : Is this an empty element ?
char depr : Is this a deprecated element ?
char dtd : 1: only in Loose DTD, 2: only Frameset
char isinline : 0 if this is a block element, 1 if it is an inline element
const char * desc : the description (NRK, Jan. 2003: new fields follow)
const char ** subelts : allowed sub-elements of this element
const char * defaultsubelt : subelement for suggested auto-repair if necessary, or NULL
const char ** attrs_opt : Optional Attributes
const char ** attrs_depr : Additional deprecated attributes
const char ** attrs_req : Required attributes
} Structure htmlEntityDesc struct _htmlEntityDesc {
unsigned int value : the UNICODE value for the character
const char * name : The entity name
const char * desc : the description
} Enum htmlParserOption {
HTML_PARSE_RECOVER = 1 : Relaxed parsing
HTML_PARSE_NODEFDTD = 4 : do not default a doctype if not found
HTML_PARSE_NOERROR = 32 : suppress error reports
HTML_PARSE_NOWARNING = 64 : suppress warning reports
HTML_PARSE_PEDANTIC = 128 : pedantic error reporting
HTML_PARSE_NOBLANKS = 256 : remove blank nodes
HTML_PARSE_NONET = 2048 : Forbid network access
HTML_PARSE_NOIMPLIED = 8192 : Do not add implied html/body... elements
HTML_PARSE_COMPACT = 65536 : compact small text nodes
HTML_PARSE_IGNORE_ENC = 2097152 : ignore internal document encoding hint
}
Enum htmlStatus {
HTML_NA = 0 : something we don't check at all
HTML_INVALID = 1
HTML_DEPRECATED = 2
HTML_VALID = 4
HTML_REQUIRED = 12 : VALID bit set so ( & HTML_VALID ) is TRUE
}
Function: UTF8ToHtmlint UTF8ToHtml (unsigned char * out, int * outlen, const unsigned char * in, int * inlen)
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
out: | a pointer to an array of bytes to store the result | outlen: | the length of @out | in: | a pointer to an array of UTF-8 chars | inlen: | the length of @in | Returns: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets written to @out. |
Function: htmlAttrAllowedhtmlStatus htmlAttrAllowed (const htmlElemDesc * elt, const xmlChar * attr, int legacy)
Checks whether an attribute is valid for an element. Has full knowledge of Required and Deprecated attributes.
elt: | HTML element | attr: | HTML attribute | legacy: | whether to allow deprecated attributes | Returns: | one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID |
Function: htmlAutoCloseTagint htmlAutoCloseTag (htmlDocPtr doc, const xmlChar * name, htmlNodePtr elem)
The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.
doc: | the HTML document | name: | The tag name | elem: | the HTML element | Returns: | 1 if autoclose, 0 otherwise |
Function: htmlCreateMemoryParserCtxthtmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer, int size)
Create a parser context for an HTML in-memory document.
buffer: | a pointer to a char array | size: | the size of the array | Returns: | the new parser context or NULL |
Function: htmlCreatePushParserCtxthtmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax, void * user_data, const char * chunk, int size, const char * filename, xmlCharEncoding enc)
Create a parser context for using the HTML parser in push mode. The value of @filename is used for fetching external entities and error/warning reports.
sax: | a SAX handler | user_data: | The user data returned on SAX callbacks | chunk: | a pointer to an array of chars | size: | number of chars in the array | filename: | an optional file name or URI | enc: | an optional encoding | Returns: | the new parser context or NULL |
Function: htmlCtxtReadDochtmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt, const xmlChar * cur, const char * URL, const char * encoding, int options)
parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context | cur: | a pointer to a zero terminated string | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlCtxtReadFdhtmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt, int fd, const char * URL, const char * encoding, int options)
parse an HTML document from a file descriptor and build a tree. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context | fd: | an open file descriptor | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlCtxtReadFilehtmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt, const char * filename, const char * encoding, int options)
parse an HTML file from the filesystem or the network. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context | filename: | a file or URL | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlCtxtReadIOhtmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void * ioctx, const char * URL, const char * encoding, int options)
parse an HTML document from I/O functions and source and build a tree. This reuses the existing @ctxt parser context
ctxt: | an HTML parser context | ioread: | an I/O read function | ioclose: | an I/O close function | ioctx: | an I/O handler | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlCtxtReadMemoryhtmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt, const char * buffer, int size, const char * URL, const char * encoding, int options)
parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context | buffer: | a pointer to a char array | size: | the size of the array | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlCtxtResetvoid htmlCtxtReset (htmlParserCtxtPtr ctxt)
Reset a parser context
ctxt: | an HTML parser context |
Function: htmlCtxtUseOptionsint htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, int options)
Applies the options to the parser context
ctxt: | an HTML parser context | options: | a combination of htmlParserOption(s) | Returns: | 0 in case of success, the set of unknown or unimplemented options in case of error. |
Function: htmlElementAllowedHereint htmlElementAllowedHere (const htmlElemDesc * parent, const xmlChar * elt)
Checks whether an HTML element may be a direct child of a parent element. Note - doesn't check for deprecated elements
parent: | HTML parent element | elt: | HTML element | Returns: | 1 if allowed; 0 otherwise. |
Function: htmlElementStatusHerehtmlStatus htmlElementStatusHere (const htmlElemDesc * parent, const htmlElemDesc * elt)
Checks whether an HTML element may be a direct child of a parent element, and if so whether it is valid or deprecated.
parent: | HTML parent element | elt: | HTML element | Returns: | one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID |
Function: htmlEncodeEntitiesint htmlEncodeEntities (unsigned char * out, int * outlen, const unsigned char * in, int * inlen, int quoteChar)
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
out: | a pointer to an array of bytes to store the result | outlen: | the length of @out | in: | a pointer to an array of UTF-8 chars | inlen: | the length of @in | quoteChar: | the quote character to escape (' or ") or zero. | Returns: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is positive, else unpredictable. The value of @outlen after return is the number of octets written to @out. |
Function: htmlEntityLookupconst htmlEntityDesc * htmlEntityLookup (const xmlChar * name)
Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
name: | the entity name | Returns: | the associated htmlEntityDescPtr if found, NULL otherwise. |
Function: htmlEntityValueLookupconst htmlEntityDesc * htmlEntityValueLookup (unsigned int value)
Look up the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
value: | the entity's unicode value | Returns: | the associated htmlEntityDescPtr if found, NULL otherwise. |
Function: htmlFreeParserCtxtvoid htmlFreeParserCtxt (htmlParserCtxtPtr ctxt)
Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.
ctxt: | an HTML parser context |
Function: htmlHandleOmittedElemint htmlHandleOmittedElem (int val)
Set and return the previous value for handling HTML omitted tags.
val: | int 0 or 1 | Returns: | the previous value: 0 for no handling, 1 for auto insertion. |
Function: htmlIsAutoClosedint htmlIsAutoClosed (htmlDocPtr doc, htmlNodePtr elem)
The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if a tag is autoclosed by one of its children.
doc: | the HTML document | elem: | the HTML element | Returns: | 1 if autoclosed, 0 otherwise |
Function: htmlIsScriptAttributeint htmlIsScriptAttribute (const xmlChar * name)
Check if an attribute is of content type Script
Function: htmlNewParserCtxthtmlParserCtxtPtr htmlNewParserCtxt (void)
Allocate and initialize a new parser context.
Function: htmlNodeStatushtmlStatus htmlNodeStatus (const htmlNodePtr node, int legacy)
Checks whether the tree node is valid. Experimental (the author only uses the HTML enhancements in a SAX parser)
Function: htmlParseCharRefint htmlParseCharRef (htmlParserCtxtPtr ctxt)
parse Reference declarations [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
ctxt: | an HTML parser context | Returns: | the value parsed (as an int) |
Function: htmlParseChunkint htmlParseChunk (htmlParserCtxtPtr ctxt, const char * chunk, int size, int terminate)
Parse a Chunk of memory
ctxt: | an HTML parser context | chunk: | a char array | size: | the size in bytes of the chunk | terminate: | last chunk indicator | Returns: | zero if no error, the xmlParserErrors otherwise. |
Function: htmlParseDochtmlDocPtr htmlParseDoc (xmlChar * cur, const char * encoding)
parse an HTML in-memory document and build a tree.
cur: | a pointer to an array of xmlChar | encoding: | a free form C string describing the HTML document encoding, or NULL | Returns: | the resulting document tree |
Function: htmlParseDocumentint htmlParseDocument (htmlParserCtxtPtr ctxt)
parse an HTML document (and build a tree if using the standard SAX interface).
ctxt: | an HTML parser context | Returns: | 0 on success, -1 in case of error. The parser context is augmented as a result of the parsing. |
Function: htmlParseElementvoid htmlParseElement (htmlParserCtxtPtr ctxt)
parse an HTML element; this is highly recursive. This is kept for compatibility with previous code versions. [39] element ::= EmptyElemTag | STag content ETag [41] Attribute ::= Name Eq AttValue
ctxt: | an HTML parser context |
Function: htmlParseEntityRefconst htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt, const xmlChar ** str)
parse an HTML entity reference [68] EntityRef ::= '&' Name ';'
ctxt: | an HTML parser context | str: | location to store the entity name | Returns: | the associated htmlEntityDescPtr if found, or NULL otherwise. If non-NULL, *str will have to be freed by the caller. |
Function: htmlParseFilehtmlDocPtr htmlParseFile (const char * filename, const char * encoding)
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time.
filename: | the filename | encoding: | a free form C string describing the HTML document encoding, or NULL | Returns: | the resulting document tree |
Function: htmlReadDochtmlDocPtr htmlReadDoc (const xmlChar * cur, const char * URL, const char * encoding, int options)
parse an HTML in-memory document and build a tree.
cur: | a pointer to a zero terminated string | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlReadFdhtmlDocPtr htmlReadFd (int fd, const char * URL, const char * encoding, int options)
parse an HTML document from a file descriptor and build a tree.
fd: | an open file descriptor | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlReadFilehtmlDocPtr htmlReadFile (const char * filename, const char * encoding, int options)
parse an HTML file from the filesystem or the network.
filename: | a file or URL | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlReadIOhtmlDocPtr htmlReadIO (xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void * ioctx, const char * URL, const char * encoding, int options)
parse an HTML document from I/O functions and source and build a tree.
ioread: | an I/O read function | ioclose: | an I/O close function | ioctx: | an I/O handler | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlReadMemoryhtmlDocPtr htmlReadMemory (const char * buffer, int size, const char * URL, const char * encoding, int options)
parse an HTML in-memory document and build a tree.
buffer: | a pointer to a char array | size: | the size of the array | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlSAXParseDochtmlDocPtr htmlSAXParseDoc (xmlChar * cur, const char * encoding, htmlSAXHandlerPtr sax, void * userData)
Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.
cur: | a pointer to an array of xmlChar | encoding: | a free form C string describing the HTML document encoding, or NULL | sax: | the SAX handler block | userData: | if using SAX, this pointer will be provided on callbacks. | Returns: | the resulting document tree unless SAX is not NULL or the document is not well formed. |
Function: htmlSAXParseFilehtmlDocPtr htmlSAXParseFile (const char * filename, const char * encoding, htmlSAXHandlerPtr sax, void * userData)
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fallback to the default DOM tree building routines.
filename: | the filename | encoding: | a free form C string describing the HTML document encoding, or NULL | sax: | the SAX handler block | userData: | if using SAX, this pointer will be provided on callbacks. | Returns: | the resulting document tree unless SAX is not NULL or the document is not well formed. |
Function: htmlTagLookupconst htmlElemDesc * htmlTagLookup (const xmlChar * tag)
Lookup the HTML tag in the ElementTable
tag: | The tag name in lowercase | Returns: | the related htmlElemDescPtr or NULL if not found. |
Daniel Veillard |