|  |  |  | libxml2 Reference Manual | 
|---|
HTMLparser - interface for an HTML 4.0 non-verifying parser
This module implements an HTML 4.0 non-verifying parser with an API compatible with that of the XML parser. It should be able to parse "real world" HTML, even if severely broken from a specification point of view.
Author(s): Daniel Veillard
#define UTF8ToHtml; #define htmlDefaultSubelement(elt); #define htmlElementAllowedHereDesc(parent, elt); #define htmlRequiredAttrs(elt); typedef xmlDocPtr htmlDocPtr; typedef struct _htmlElemDesc htmlElemDesc; typedef htmlElemDesc * htmlElemDescPtr; typedef struct _htmlEntityDesc htmlEntityDesc; typedef htmlEntityDesc * htmlEntityDescPtr; typedef xmlNodePtr htmlNodePtr; typedef xmlParserCtxt htmlParserCtxt; typedef xmlParserCtxtPtr htmlParserCtxtPtr; typedef xmlParserInput htmlParserInput; typedef xmlParserInputPtr htmlParserInputPtr; typedef xmlParserNodeInfo htmlParserNodeInfo; typedef enum htmlParserOption; typedef xmlSAXHandler htmlSAXHandler; typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; typedef enum htmlStatus; htmlStatus htmlAttrAllowed (const htmlElemDesc * elt,
const xmlChar * attr,
int legacy); int htmlAutoCloseTag (htmlDocPtr doc,
const xmlChar * name,
htmlNodePtr elem); htmlParserCtxtPtr htmlCreateFileParserCtxt (const char * filename,
const char * encoding); htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer,
int size); htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax,
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc); htmlDocPtr htmlCtxtParseDocument (htmlParserCtxtPtr ctxt,
xmlParserInputPtr input); htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt,
const xmlChar * str,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt,
int fd,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt,
const char * filename,
const char * encoding,
int options); htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt,
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options); htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt,
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options); void htmlCtxtReset (htmlParserCtxtPtr ctxt); int htmlCtxtSetOptions (xmlParserCtxtPtr ctxt,
int options); int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
int options); int htmlElementAllowedHere (const htmlElemDesc * parent,
const xmlChar * elt); htmlStatus htmlElementStatusHere (const htmlElemDesc * parent,
const htmlElemDesc * elt); int htmlEncodeEntities (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar); const htmlEntityDesc * htmlEntityLookup (const xmlChar * name); const htmlEntityDesc * htmlEntityValueLookup (unsigned int value); void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); int htmlHandleOmittedElem (int val); void htmlInitAutoClose (void); int htmlIsAutoClosed (htmlDocPtr doc,
htmlNodePtr elem); int htmlIsScriptAttribute (const xmlChar * name); htmlParserCtxtPtr htmlNewParserCtxt (void); htmlParserCtxtPtr htmlNewSAXParserCtxt (const htmlSAXHandler * sax,
void * userData); htmlStatus htmlNodeStatus (htmlNodePtr node,
int legacy); int htmlParseCharRef (htmlParserCtxtPtr ctxt); int htmlParseChunk (htmlParserCtxtPtr ctxt,
const char * chunk,
int size,
int terminate); htmlDocPtr htmlParseDoc (const xmlChar * cur,
const char * encoding); int htmlParseDocument (htmlParserCtxtPtr ctxt); void htmlParseElement (htmlParserCtxtPtr ctxt); const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt,
const xmlChar ** str); htmlDocPtr htmlParseFile (const char * filename,
const char * encoding); htmlDocPtr htmlReadDoc (const xmlChar * str,
const char * url,
const char * encoding,
int options); htmlDocPtr htmlReadFd (int fd,
const char * url,
const char * encoding,
int options); htmlDocPtr htmlReadFile (const char * filename,
const char * encoding,
int options); htmlDocPtr htmlReadIO (xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * url,
const char * encoding,
int options); htmlDocPtr htmlReadMemory (const char * buffer,
int size,
const char * url,
const char * encoding,
int options); htmlDocPtr htmlSAXParseDoc (const xmlChar * cur,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData); htmlDocPtr htmlSAXParseFile (const char * filename,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData); const htmlElemDesc * htmlTagLookup (const xmlChar * tag); int htmlUTF8ToHtml (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen);
#define UTF8ToHtml;
#define htmlDefaultSubelement(elt);
Returns the default subelement for this element
| elt: | HTML element | 
#define htmlElementAllowedHereDesc(parent, elt);
Checks whether an HTML element description may be a direct child of the specified element. Returns 1 if allowed; 0 otherwise.
| parent: | HTML parent element | 
| elt: | HTML element | 
#define htmlRequiredAttrs(elt);
Returns the attributes required for the specified element.
| elt: | HTML element | 
struct _htmlElemDesc {
    const char *	name
    char	startTag
    char	endTag
    char	saveEndTag
    char	empty
    char	depr
    char	dtd
    char	isinline
    const char *	desc
    const char **	subelts
    const char *	defaultsubelt
    const char **	attrs_opt
    const char **	attrs_depr
    const char **	attrs_req
    int	dataMode
} htmlElemDesc;
htmlElemDesc * htmlElemDescPtr;
struct _htmlEntityDesc {
    unsigned int	value
    const char *	name
    const char *	desc
} htmlEntityDesc;
htmlEntityDesc * htmlEntityDescPtr;
xmlNodePtr htmlNodePtr;
xmlParserCtxt htmlParserCtxt;
xmlParserCtxtPtr htmlParserCtxtPtr;
xmlParserInput htmlParserInput;
xmlParserInputPtr htmlParserInputPtr;
xmlParserNodeInfo htmlParserNodeInfo;
enum htmlParserOption { HTML_PARSE_RECOVER = 1 /* No effect */ HTML_PARSE_HTML5 = 2 /* HTML5 support */ HTML_PARSE_NODEFDTD = 4 /* do not default a doctype if not found */ HTML_PARSE_NOERROR = 32 /* suppress error reports */ HTML_PARSE_NOWARNING = 64 /* suppress warning reports */ HTML_PARSE_PEDANTIC = 128 /* No effect */ HTML_PARSE_NOBLANKS = 256 /* remove blank nodes */ HTML_PARSE_NONET = 2048 /* No effect */ HTML_PARSE_NOIMPLIED = 8192 /* Do not add implied html/body... elements */ HTML_PARSE_COMPACT = 65536 /* compact small text nodes */ HTML_PARSE_HUGE = 524288 /* relax any hardcoded limit from the parser */ HTML_PARSE_IGNORE_ENC = 2097152 /* ignore internal document encoding hint */ HTML_PARSE_BIG_LINES = 4194304 /* Store big lines numbers in text PSVI field */ };
xmlSAXHandler htmlSAXHandler;
xmlSAXHandlerPtr htmlSAXHandlerPtr;
enum htmlStatus { HTML_NA = 0 /* something we don't check at all */ HTML_INVALID = 1 HTML_DEPRECATED = 2 HTML_VALID = 4 HTML_REQUIRED = 12 /* VALID bit set so ( & HTML_VALID ) is TRUE */ };
const xmlSAXHandlerV1 htmlDefaultSAXHandler;
DEPRECATED: This handler is unused and will be removed from future versions. Default old SAX v1 handler for HTML, builds the DOM tree
htmlStatus htmlAttrAllowed (const htmlElemDesc * elt,
const xmlChar * attr,
int legacy)
DEPRECATED: Don't use.
| elt: | HTML element | 
| attr: | HTML attribute | 
| legacy: | whether to allow deprecated attributes | 
| Returns: | HTML_VALID | 
int htmlAutoCloseTag (htmlDocPtr doc,
const xmlChar * name,
htmlNodePtr elem)
DEPRECATED: Internal function, don't use. The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.
| doc: | the HTML document | 
| name: | The tag name | 
| elem: | the HTML element | 
| Returns: | 1 if autoclose, 0 otherwise | 
htmlParserCtxtPtr htmlCreateFileParserCtxt (const char * filename,
const char * encoding)
DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile. Create a parser context to read from a file. A non-NULL encoding overrides encoding declarations in the document. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.
| filename: | the filename | 
| encoding: | optional encoding | 
| Returns: | the new parser context or NULL if a memory allocation failed. | 
htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer,
int size)
DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory. Create a parser context for an HTML in-memory document. The input buffer must not contain any terminating null bytes.
| buffer: | a pointer to a char array | 
| size: | the size of the array | 
| Returns: | the new parser context or NULL | 
htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax,
void * user_data,
const char * chunk,
int size,
const char * filename,
xmlCharEncoding enc)
Create a parser context for using the HTML parser in push mode.
| sax: | a SAX handler (optional) | 
| user_data: | The user data returned on SAX callbacks (optional) | 
| chunk: | a pointer to an array of chars (optional) | 
| size: | number of chars in the array | 
| filename: | only used for error reporting (optional) | 
| enc: | encoding (deprecated, pass XML_CHAR_ENCODING_NONE) | 
| Returns: | the new parser context or NULL if a memory allocation failed. | 
htmlDocPtr htmlCtxtParseDocument (htmlParserCtxtPtr ctxt,
xmlParserInputPtr input)
Parse an HTML document and return the resulting document tree. Available since 2.13.0.
| ctxt: | an HTML parser context | 
| input: | parser input | 
| Returns: | the resulting document tree or NULL | 
htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt,
const xmlChar * str,
const char * URL,
const char * encoding,
int options)
Parse an HTML in-memory document and build a tree. See htmlCtxtUseOptions for details.
| ctxt: | an HTML parser context | 
| str: | a pointer to a zero terminated string | 
| URL: | only used for error reporting (optional) | 
| encoding: | the document encoding (optional) | 
| options: | a combination of htmlParserOptions | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt,
int fd,
const char * URL,
const char * encoding,
int options)
Parse an HTML from a file descriptor and build a tree. See htmlCtxtUseOptions for details. NOTE that the file descriptor will not be closed when the context is freed or reset.
| ctxt: | an HTML parser context | 
| fd: | an open file descriptor | 
| URL: | only used for error reporting (optional) | 
| encoding: | the document encoding (optional) | 
| options: | a combination of htmlParserOptions | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt,
const char * filename,
const char * encoding,
int options)
Parse an HTML file from the filesystem, the network or a user-defined resource loader. See htmlCtxtUseOptions for details.
| ctxt: | an HTML parser context | 
| filename: | a file or URL | 
| encoding: | the document encoding (optional) | 
| options: | a combination of htmlParserOptions | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt,
xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * URL,
const char * encoding,
int options)
Parse an HTML document from I/O functions and source and build a tree. See htmlCtxtUseOptions for details.
| ctxt: | an HTML parser context | 
| ioread: | an I/O read function | 
| ioclose: | an I/O close function | 
| ioctx: | an I/O handler | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt,
const char * buffer,
int size,
const char * URL,
const char * encoding,
int options)
Parse an HTML in-memory document and build a tree. The input buffer must not contain any terminating null bytes. See htmlCtxtUseOptions for details.
| ctxt: | an HTML parser context | 
| buffer: | a pointer to a char array | 
| size: | the size of the array | 
| URL: | only used for error reporting (optional) | 
| encoding: | the document encoding (optional) | 
| options: | a combination of htmlParserOptions | 
| Returns: | the resulting document tree | 
void htmlCtxtReset (htmlParserCtxtPtr ctxt)
Reset a parser context
| ctxt: | an HTML parser context | 
int htmlCtxtSetOptions (xmlParserCtxtPtr ctxt,
int options)
Applies the options to the parser context. Unset options are cleared. Available since 2.14.0. With older versions, you can use htmlCtxtUseOptions. HTML_PARSE_RECOVER: No effect as of 2.14.0. HTML_PARSE_HTML5: Make the tokenizer emit a SAX callback for each token. This results in unbalanced invocations of startElement and endElement. For now, this is only usable with custom SAX callbacks. HTML_PARSE_NODEFDTD: Do not default to a doctype if none was found. HTML_PARSE_NOERROR: Disable error and warning reports to the error handlers. Errors are still accessible with xmlCtxtGetLastError. HTML_PARSE_NOWARNING: Disable warning reports. HTML_PARSE_PEDANTIC: No effect. HTML_PARSE_NOBLANKS: Remove some text nodes containing only whitespace from the result document. Which nodes are removed depends on a conservative heuristic. The reindenting feature of the serialization code relies on this option to be set when parsing. Use of this option is DISCOURAGED. HTML_PARSE_NONET: No effect. HTML_PARSE_NOIMPLIED: Do not add implied html, head or body elements. HTML_PARSE_COMPACT: Store small strings directly in the node struct to save memory. HTML_PARSE_HUGE: Relax some internal limits. Available since 2.14.0. XML_PARSE_HUGE works with older versions. Maximum size of text nodes, tags, comments, CDATA sections: normal 10M, huge 1B. Maximum size of names, system literals, pubid literals: normal 50K, huge 10M. Maximum nesting depth of elements: normal 256, huge 2048. HTML_PARSE_IGNORE_ENC: Ignore the encoding in the HTML declaration. This option is mostly unneeded these days. The only effect is to enforce UTF-8 decoding of ASCII-like data. HTML_PARSE_BIG_LINES: Enable reporting of line numbers larger than 65535. Available since 2.14.0.
| ctxt: | an HTML parser context | 
| options: | a bitmask of htmlParserOption values | 
| Returns: | 0 in case of success, the set of unknown or unimplemented options in case of error. | 
int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
int options)
DEPRECATED: Use htmlCtxtSetOptions. Applies the options to the parser context. The following options are never cleared and can only be enabled: HTML_PARSE_NODEFDTD HTML_PARSE_NOERROR HTML_PARSE_NOWARNING HTML_PARSE_NOIMPLIED HTML_PARSE_COMPACT HTML_PARSE_HUGE HTML_PARSE_IGNORE_ENC HTML_PARSE_BIG_LINES
| ctxt: | an HTML parser context | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | 0 in case of success, the set of unknown or unimplemented options in case of error. | 
int htmlElementAllowedHere (const htmlElemDesc * parent,
const xmlChar * elt)
DEPRECATED: Don't use.
| parent: | HTML parent element | 
| elt: | HTML element | 
| Returns: | 1 | 
htmlStatus htmlElementStatusHere (const htmlElemDesc * parent,
const htmlElemDesc * elt)
DEPRECATED: Don't use.
| parent: | HTML parent element | 
| elt: | HTML element | 
| Returns: | HTML_VALID | 
int htmlEncodeEntities (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen,
int quoteChar)
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
| out: | a pointer to an array of bytes to store the result | 
| outlen: | the length of @out | 
| in: | a pointer to an array of UTF-8 chars | 
| inlen: | the length of @in | 
| quoteChar: | the quote character to escape (' or ") or zero. | 
| Returns: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is non-negative, else unpredictable. The value of @outlen after return is the number of octets produced. | 
const htmlEntityDesc * htmlEntityLookup (const xmlChar * name)
Lookup the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
| name: | the entity name | 
| Returns: | the associated htmlEntityDescPtr if found, NULL otherwise. | 
const htmlEntityDesc * htmlEntityValueLookup (unsigned int value)
Lookup the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
| value: | the entity's unicode value | 
| Returns: | the associated htmlEntityDescPtr if found, NULL otherwise. | 
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt)
Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.
| ctxt: | an HTML parser context | 
int htmlHandleOmittedElem (int val)
DEPRECATED: Use HTML_PARSE_NOIMPLIED. Set and return the previous value for handling HTML omitted tags.
| val: | int 0 or 1 | 
| Returns: | the previous value: 0 for no handling, 1 for auto insertion. | 
int htmlIsAutoClosed (htmlDocPtr doc,
htmlNodePtr elem)
DEPRECATED: Internal function, don't use. The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if a tag is autoclosed by one of its children.
| doc: | the HTML document | 
| elem: | the HTML element | 
| Returns: | 1 if autoclosed, 0 otherwise | 
int htmlIsScriptAttribute (const xmlChar * name)
Check if an attribute is of content type Script
| name: | an attribute name | 
| Returns: | 1 if the attribute is a script, 0 otherwise | 
htmlParserCtxtPtr htmlNewParserCtxt (void)
Allocate and initialize a new HTML parser context. This can be used to parse HTML documents into DOM trees with functions like htmlCtxtReadFile or htmlCtxtReadMemory. See htmlCtxtUseOptions for parser options. See xmlCtxtSetErrorHandler for advanced error handling. See htmlNewSAXParserCtxt for custom SAX parsers.
| Returns: | the htmlParserCtxtPtr or NULL in case of allocation error | 
htmlParserCtxtPtr htmlNewSAXParserCtxt (const htmlSAXHandler * sax,
void * userData)
Allocate and initialize a new HTML SAX parser context. If userData is NULL, the parser context will be passed as user data. Available since 2.11.0. If you want to support older versions, it's best to invoke htmlNewParserCtxt and set ctxt->sax with struct assignment. Also see htmlNewParserCtxt.
| sax: | SAX handler | 
| userData: | user data | 
| Returns: | the htmlParserCtxtPtr or NULL in case of allocation error | 
htmlStatus htmlNodeStatus (htmlNodePtr node,
int legacy)
DEPRECATED: Don't use.
| node: | an htmlNodePtr in a tree | 
| legacy: | whether to allow deprecated elements (YES is faster here for Element nodes) | 
| Returns: | HTML_VALID | 
int htmlParseCharRef (htmlParserCtxtPtr ctxt)
DEPRECATED: Internal function, don't use.
| ctxt: | an HTML parser context | 
| Returns: | 0 | 
int htmlParseChunk (htmlParserCtxtPtr ctxt,
const char * chunk,
int size,
int terminate)
Parse a chunk of memory in push parser mode. Assumes that the parser context was initialized with htmlCreatePushParserCtxt. The last chunk, which will often be empty, must be marked with the @terminate flag. With the default SAX callbacks, the resulting document will be available in ctxt->myDoc. This pointer will not be freed by the library. If the document isn't well-formed, ctxt->myDoc is set to NULL.
| ctxt: | an HTML parser context | 
| chunk: | chunk of memory | 
| size: | size of chunk in bytes | 
| terminate: | last chunk indicator | 
| Returns: | an xmlParserErrors code (0 on success). | 
htmlDocPtr htmlParseDoc (const xmlChar * cur,
const char * encoding)
DEPRECATED: Use htmlReadDoc. Parse an HTML in-memory document and build a tree. This function uses deprecated global parser options.
| cur: | a pointer to an array of xmlChar | 
| encoding: | the encoding (optional) | 
| Returns: | the resulting document tree | 
int htmlParseDocument (htmlParserCtxtPtr ctxt)
Parse an HTML document and invoke the SAX handlers. This is useful if you're only interested in custom SAX callbacks. If you want a document tree, use htmlCtxtParseDocument.
| ctxt: | an HTML parser context | 
| Returns: | 0 on success, -1 in case of error. | 
void htmlParseElement (htmlParserCtxtPtr ctxt)
DEPRECATED: Internal function, don't use. Parse an HTML element. This is highly recursive and is kept for compatibility with previous code versions. [39] element ::= EmptyElemTag | STag content ETag [41] Attribute ::= Name Eq AttValue
| ctxt: | an HTML parser context | 
const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt,
const xmlChar ** str)
DEPRECATED: Internal function, don't use.
| ctxt: | an HTML parser context | 
| str: | location to store the entity name | 
| Returns: | NULL. | 
htmlDocPtr htmlParseFile (const char * filename,
const char * encoding)
Parse an HTML file and build a tree.
| filename: | the filename | 
| encoding: | encoding (optional) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlReadDoc (const xmlChar * str,
const char * url,
const char * encoding,
int options)
Convenience function to parse an HTML document from a zero-terminated string. See htmlCtxtReadDoc for details.
| str: | a pointer to a zero terminated string | 
| url: | only used for error reporting (optional) | 
| encoding: | the document encoding (optional) | 
| options: | a combination of htmlParserOptions | 
| Returns: | the resulting document tree. | 
htmlDocPtr htmlReadFd (int fd,
const char * url,
const char * encoding,
int options)
Convenience function to parse an HTML document from a file descriptor. NOTE that the file descriptor will not be closed when the context is freed or reset. See htmlCtxtReadFd for details.
| fd: | an open file descriptor | 
| url: | only used for error reporting (optional) | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOptions | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlReadFile (const char * filename,
const char * encoding,
int options)
Convenience function to parse an HTML file from the filesystem, the network or a global user-defined resource loader. See htmlCtxtReadFile for details.
| filename: | a file or URL | 
| encoding: | the document encoding (optional) | 
| options: | a combination of htmlParserOptions | 
| Returns: | the resulting document tree. | 
htmlDocPtr htmlReadIO (xmlInputReadCallback ioread,
xmlInputCloseCallback ioclose,
void * ioctx,
const char * url,
const char * encoding,
int options)
Convenience function to parse an HTML document from I/O functions and context. See htmlCtxtReadIO for details.
| ioread: | an I/O read function | 
| ioclose: | an I/O close function (optional) | 
| ioctx: | an I/O handler | 
| url: | only used for error reporting (optional) | 
| encoding: | the document encoding (optional) | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlReadMemory (const char * buffer,
int size,
const char * url,
const char * encoding,
int options)
Convenience function to parse an HTML document from memory. The input buffer must not contain any terminating null bytes. See htmlCtxtReadMemory for details.
| buffer: | a pointer to a char array | 
| size: | the size of the array | 
| url: | only used for error reporting (optional) | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns: | the resulting document tree | 
htmlDocPtr htmlSAXParseDoc (const xmlChar * cur,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc. Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.
| cur: | a pointer to an array of xmlChar | 
| encoding: | a free form C string describing the HTML document encoding, or NULL | 
| sax: | the SAX handler block | 
| userData: | if using SAX, this pointer will be provided on callbacks. | 
| Returns: | the resulting document tree unless SAX is NULL or the document is not well formed. | 
htmlDocPtr htmlSAXParseFile (const char * filename,
const char * encoding,
htmlSAXHandlerPtr sax,
void * userData)
DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile. Parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fall back to the default DOM tree building routines.
| filename: | the filename | 
| encoding: | encoding (optional) | 
| sax: | the SAX handler block | 
| userData: | if using SAX, this pointer will be provided on callbacks. | 
| Returns: | the resulting document tree unless SAX is NULL or the document is not well formed. | 
const htmlElemDesc * htmlTagLookup (const xmlChar * tag)
Lookup the HTML tag in the ElementTable
| tag: | The tag name in lowercase | 
| Returns: | the related htmlElemDescPtr or NULL if not found. | 
int htmlUTF8ToHtml (unsigned char * out,
int * outlen,
const unsigned char * in,
int * inlen)
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
| out: | a pointer to an array of bytes to store the result | 
| outlen: | the length of @out | 
| in: | a pointer to an array of UTF-8 chars | 
| inlen: | the length of @in | 
| Returns: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the return value is non-negative, else unpredictable. The value of @outlen after return is the number of octets produced. |