Cogs.Core
Classes | Typedefs | Enumerations | Functions | Variables
gumbo.h File Reference
#include <stdbool.h>
#include <stddef.h>
#include "tag_enum.h"

Go to the source code of this file.

Classes

struct  GumboSourcePosition
 
struct  GumboStringPiece
 
struct  GumboVector
 
struct  GumboAttribute
 
struct  GumboDocument
 
struct  GumboText
 
struct  GumboElement
 
struct  GumboInternalNode
 
struct  GumboInternalOptions
 
struct  GumboInternalOutput
 

Typedefs

typedef struct GumboInternalNode GumboNode
 
typedef void *(* GumboAllocatorFunction) (void *userdata, size_t size)
 
typedef void(* GumboDeallocatorFunction) (void *userdata, void *ptr)
 
typedef struct GumboInternalOptions GumboOptions
 
typedef struct GumboInternalOutput GumboOutput
 

Enumerations

enum  GumboTag { GUMBO_TAG_UNKNOWN , GUMBO_TAG_LAST }
 
enum  GumboAttributeNamespaceEnum { GUMBO_ATTR_NAMESPACE_NONE , GUMBO_ATTR_NAMESPACE_XLINK , GUMBO_ATTR_NAMESPACE_XML , GUMBO_ATTR_NAMESPACE_XMLNS }
 
enum  GumboNodeType {
  GUMBO_NODE_DOCUMENT , GUMBO_NODE_ELEMENT , GUMBO_NODE_TEXT , GUMBO_NODE_CDATA ,
  GUMBO_NODE_COMMENT , GUMBO_NODE_WHITESPACE , GUMBO_NODE_TEMPLATE
}
 
enum  GumboQuirksModeEnum { GUMBO_DOCTYPE_NO_QUIRKS , GUMBO_DOCTYPE_QUIRKS , GUMBO_DOCTYPE_LIMITED_QUIRKS }
 
enum  GumboNamespaceEnum { GUMBO_NAMESPACE_HTML , GUMBO_NAMESPACE_SVG , GUMBO_NAMESPACE_MATHML }
 
enum  GumboParseFlags {
  GUMBO_INSERTION_NORMAL = 0 , GUMBO_INSERTION_BY_PARSER = 1 << 0 , GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1 , GUMBO_INSERTION_IMPLIED = 1 << 3 ,
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4 , GUMBO_INSERTION_FROM_ISINDEX = 1 << 5 , GUMBO_INSERTION_FROM_IMAGE = 1 << 6 , GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7 ,
  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8 , GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9 , GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10
}
 

Functions

bool gumbo_string_equals (const GumboStringPiece *str1, const GumboStringPiece *str2)
 
bool gumbo_string_equals_ignore_case (const GumboStringPiece *str1, const GumboStringPiece *str2)
 
int gumbo_vector_index_of (GumboVector *vector, const void *element)
 
const char * gumbo_normalized_tagname (GumboTag tag)
 
void gumbo_tag_from_original_text (GumboStringPiece *text)
 
const char * gumbo_normalize_svg_tagname (const GumboStringPiece *tagname)
 
GumboTag gumbo_tag_enum (const char *tagname)
 
GumboTag gumbo_tagn_enum (const char *tagname, unsigned int length)
 
GumboAttribute * gumbo_get_attribute (const GumboVector *attrs, const char *name)
 
GumboOutput * gumbo_parse (const char *buffer)
 
GumboOutput * gumbo_parse_with_options (const GumboOptions *options, const char *buffer, size_t buffer_length)
 
void gumbo_destroy_output (const GumboOptions *options, GumboOutput *output)
 

Variables

const GumboSourcePosition kGumboEmptySourcePosition
 
const GumboStringPiece kGumboEmptyString
 
const GumboVector kGumboEmptyVector
 
const GumboOptions kGumboDefaultOptions
 

Typedef Documentation

◆ GumboAllocatorFunction

typedef void *(* GumboAllocatorFunction) (void *userdata, size_t size)

The type for an allocator function. Takes the 'userdata' member of the GumboParser struct as its first argument. Semantics should be the same as malloc, i.e. return a block of 'size' bytes on success or NULL on failure. Allocating a block of 0 bytes behaves as per malloc.

Definition at line 551 of file gumbo.h.

◆ GumboDeallocatorFunction

typedef void(* GumboDeallocatorFunction) (void *userdata, void *ptr)

The type for a deallocator function. Takes the 'userdata' member of the GumboParser struct as its first argument.

Definition at line 557 of file gumbo.h.

◆ GumboNode

typedef struct GumboInternalNode GumboNode

Forward declaration of GumboNode so it can be used recursively in GumboNode.parent.

Definition at line 318 of file gumbo.h.

◆ GumboOptions

Input struct containing configuration options for the parser. These let you specify alternate memory managers, provide different error handling, etc. Use kGumboDefaultOptions for sensible defaults, and only set what you need.

◆ GumboOutput

The output struct containing the results of the parse.

Enumeration Type Documentation

◆ GumboAttributeNamespaceEnum

Attribute namespaces. HTML includes special handling for XLink, XML, and XMLNS namespaces on attributes. Everything else goes in the generic "NONE" namespace.

Definition at line 214 of file gumbo.h.

◆ GumboNamespaceEnum

Namespaces. Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix. Rather, anything inside an <svg> tag is in the SVG namespace, anything inside the <math> tag is in the MathML namespace, and anything else is inside the HTML namespace. No other namespaces are supported, so this can be an enum only.

Definition at line 336 of file gumbo.h.

◆ GumboNodeType

Enum denoting the type of node. This determines the type of the node.v union.

Enumerator
GUMBO_NODE_DOCUMENT 

Document node. v will be a GumboDocument.

GUMBO_NODE_ELEMENT 

Element node. v will be a GumboElement.

GUMBO_NODE_TEXT 

Text node. v will be a GumboText.

GUMBO_NODE_CDATA 

CDATA node. v will be a GumboText.

GUMBO_NODE_COMMENT 

Comment node. v will be a GumboText, excluding comment delimiters.

GUMBO_NODE_WHITESPACE 

Text node whose entire content is whitespace. v will be a GumboText.

GUMBO_NODE_TEMPLATE 

Template node. This is separate from GUMBO_NODE_ELEMENT because many client libraries will want to ignore the contents of template nodes, as the spec suggests. Recursing on GUMBO_NODE_ELEMENT will do the right thing here, while clients that want to include template contents should also check for GUMBO_NODE_TEMPLATE. v will be a GumboElement.

Definition at line 293 of file gumbo.h.

◆ GumboParseFlags

Parse flags. We track the reasons for parser insertion of nodes and store them in a bitvector in the node itself. This lets client code optimize out nodes that are implied by the HTML structure of the document, or flag constructs that may not be allowed by a style guide, or track the prevalence of incorrect or tricky HTML code.

Enumerator
GUMBO_INSERTION_NORMAL 

A normal node - both start and end tags appear in the source, nothing has been reparented.

GUMBO_INSERTION_BY_PARSER 

A node inserted by the parser to fulfill some implicit insertion rule. This is usually set in addition to some other flag giving a more specific insertion reason; it's a generic catch-all term meaning "The start tag for this node did not appear in the document source".

GUMBO_INSERTION_IMPLICIT_END_TAG 

A flag indicating that the end tag for this node did not appear in the document source. Note that in some cases, you can still have parser-inserted nodes with an explicit end tag: for example, "Text</html>" has GUMBO_INSERTION_BY_PARSER set on the <html> node, but GUMBO_INSERTION_IMPLICIT_END_TAG is unset, as the </html> tag actually exists. This flag will be set only if the end tag is completely missing; in some cases, the end tag may be misplaced (e.g. a </body> tag with text afterwards), which will leave this flag unset and require clients to inspect the parse errors for that case.

GUMBO_INSERTION_IMPLIED 

A flag for nodes that are inserted because their presence is implied by other tags, eg. <html>, <head>, <body>, <tbody>, etc.

GUMBO_INSERTION_CONVERTED_FROM_END_TAG 

A flag for nodes that are converted from their end tag equivalents. For example, </p> when no paragraph is open implies that the parser should create a <p> tag and immediately close it, while </br> means the same thing as <br>.

GUMBO_INSERTION_FROM_ISINDEX 

A flag for nodes that are converted from the parse of an <isindex> tag.

GUMBO_INSERTION_FROM_IMAGE 

A flag for <image> tags that are rewritten as <img>.

GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT 

A flag for nodes that are cloned as a result of the reconstruction of active formatting elements. This is set only on the clone; the initial portion of the formatting run is a NORMAL node with an IMPLICIT_END_TAG.

GUMBO_INSERTION_ADOPTION_AGENCY_CLONED 

A flag for nodes that are cloned by the adoption agency algorithm.

GUMBO_INSERTION_ADOPTION_AGENCY_MOVED 

A flag for nodes that are moved by the adoption agency algorithm.

GUMBO_INSERTION_FOSTER_PARENTED 

A flag for nodes that have been foster-parented out of a table (or should've been foster-parented, if verbatim mode is set).

Definition at line 350 of file gumbo.h.

◆ GumboQuirksModeEnum

◆ GumboTag

enum GumboTag

An enum for all the tags defined in the HTML5 standard. These correspond to the tag names themselves. Enum constants exist only for tags which appear in the spec itself (or for tags with special handling in the SVG and MathML namespaces); any other tags appear as GUMBO_TAG_UNKNOWN and the actual tag name can be obtained through original_tag.

This is mostly for API convenience, so that clients of this library don't need to perform a strcasecmp to find the normalized tag name. It also has efficiency benefits, by letting the parser work with enums instead of strings.

Definition at line 158 of file gumbo.h.

Function Documentation

◆ gumbo_destroy_output()

void gumbo_destroy_output ( const GumboOptions *  options,
GumboOutput *  output 
)

Release the memory used for the parse tree & parse errors.

◆ gumbo_get_attribute()

GumboAttribute * gumbo_get_attribute ( const GumboVector *  attrs,
const char *  name 
)

Given a vector of GumboAttributes, look up the one with the specified name and return it, or NULL if no such attribute exists. This uses a case-insensitive match, as HTML is case-insensitive.

◆ gumbo_normalize_svg_tagname()

const char * gumbo_normalize_svg_tagname ( const GumboStringPiece *  tagname)

Fixes the case of SVG elements that are not all lowercase. http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-inforeign This is not done at parse time because there's no place to store a mutated tag name. tag_name is an enum (which will be TAG_UNKNOWN for most SVG tags without special handling), while original_tag_name is a pointer into the original buffer. Instead, we provide this helper function that clients can use to rename SVG tags as appropriate. Returns the case-normalized SVG tagname if a replacement is found, or NULL if no normalization is called for. The return value is static data and owned by the library.

◆ gumbo_normalized_tagname()

const char * gumbo_normalized_tagname ( GumboTag  tag)

Returns the normalized (usually all-lowercased, except for foreign content) tag name for a GumboTag enum. Return value is static data owned by the library.

◆ gumbo_parse()

GumboOutput * gumbo_parse ( const char *  buffer)

Parses a buffer of UTF-8 text into a GumboNode parse tree. The buffer must live at least as long as the parse tree, as some fields (e.g. original_text) point directly into the original buffer.

This doesn't support buffers longer than 4 gigabytes.

◆ gumbo_parse_with_options()

GumboOutput * gumbo_parse_with_options ( const GumboOptions *  options,
const char *  buffer,
size_t  buffer_length 
)

Extended version of gumbo_parse that takes an explicit options structure, buffer, and length.

◆ gumbo_string_equals()

bool gumbo_string_equals ( const GumboStringPiece *  str1,
const GumboStringPiece *  str2 
)

Compares two GumboStringPieces, and returns true if they're equal or false otherwise.

◆ gumbo_string_equals_ignore_case()

bool gumbo_string_equals_ignore_case ( const GumboStringPiece *  str1,
const GumboStringPiece *  str2 
)

Compares two GumboStringPieces ignoring case, and returns true if they're equal or false otherwise.

◆ gumbo_tag_enum()

GumboTag gumbo_tag_enum ( const char *  tagname)

Converts a tag name string (which may be in upper or mixed case) to a tag enum. This version expects tagname to be NUL-terminated; gumbo_tagn_enum takes an explicit length instead.

◆ gumbo_tag_from_original_text()

void gumbo_tag_from_original_text ( GumboStringPiece *  text)

Extracts the tag name from the original_text field of an element or token by stripping off </> characters and attributes and adjusting the passed-in GumboStringPiece appropriately. The tag name is in the original case and shares a buffer with the original text, to simplify memory management. Behavior is undefined if a string-piece that doesn't represent an HTML tag (<tagname> or </tagname>) is passed in. If the string piece is completely empty (NULL data pointer), then this function will exit successfully as a no-op.

◆ gumbo_vector_index_of()

int gumbo_vector_index_of ( GumboVector *  vector,
const void *  element 
)

Returns the first index at which an element appears in this vector (testing by pointer equality), or -1 if it never does.

Variable Documentation

◆ kGumboDefaultOptions

const GumboOptions kGumboDefaultOptions
extern

Default options struct; use this with gumbo_parse_with_options.

◆ kGumboEmptySourcePosition

const GumboSourcePosition kGumboEmptySourcePosition
extern

A SourcePosition used for elements that have no source position, i.e. parser-inserted elements.

◆ kGumboEmptyString

const GumboStringPiece kGumboEmptyString
extern

A constant to represent a 0-length null string.

◆ kGumboEmptyVector

const GumboVector kGumboEmptyVector
extern

An empty (0-length, 0-capacity) GumboVector.