Cogs.Core
error.h
1// Copyright 2010 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15// Author: jdtang@google.com (Jonathan Tang)
16//
17// Error types, enums, and handling functions.
18
19#ifndef GUMBO_ERROR_H_
20#define GUMBO_ERROR_H_
21#ifdef _MSC_VER
22#ifndef _CRT_SECURE_NO_WARNINGS
23#define _CRT_SECURE_NO_WARNINGS
24#endif
25#endif
26#include <stdint.h>
27
28#include "gumbo.h"
29#include "insertion_mode.h"
30#include "string_buffer.h"
31#include "token_type.h"
32
33#ifdef __cplusplus
34extern "C" {
35#endif
36
38
// Identifies the kind of error stored in a GumboError (its `type` field).
// The enumerators have no explicit values, so their numeric values follow
// declaration order. Where the GumboError union documents a type-specific
// payload for an enumerator, the member is noted next to it below.
39typedef enum {
40 GUMBO_ERR_UTF8_INVALID,  // payload: v.codepoint
41 GUMBO_ERR_UTF8_TRUNCATED,  // payload: v.codepoint
42 GUMBO_ERR_UTF8_NULL,
43 GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS,
44 GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON,  // payload: v.codepoint
45 GUMBO_ERR_NUMERIC_CHAR_REF_INVALID,  // payload: v.codepoint
46 GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON,  // payload: v.text
47 GUMBO_ERR_NAMED_CHAR_REF_INVALID,  // payload: v.text
48 GUMBO_ERR_TAG_STARTS_WITH_QUESTION,
49 GUMBO_ERR_TAG_EOF,
50 GUMBO_ERR_TAG_INVALID,
51 GUMBO_ERR_CLOSE_TAG_EMPTY,
52 GUMBO_ERR_CLOSE_TAG_EOF,
53 GUMBO_ERR_CLOSE_TAG_INVALID,
54 GUMBO_ERR_SCRIPT_EOF,
55 GUMBO_ERR_ATTR_NAME_EOF,
56 GUMBO_ERR_ATTR_NAME_INVALID,
57 GUMBO_ERR_ATTR_DOUBLE_QUOTE_EOF,
58 GUMBO_ERR_ATTR_SINGLE_QUOTE_EOF,
59 GUMBO_ERR_ATTR_UNQUOTED_EOF,
60 GUMBO_ERR_ATTR_UNQUOTED_RIGHT_BRACKET,
61 GUMBO_ERR_ATTR_UNQUOTED_EQUALS,
62 GUMBO_ERR_ATTR_AFTER_EOF,
63 GUMBO_ERR_ATTR_AFTER_INVALID,
64 GUMBO_ERR_DUPLICATE_ATTR,  // payload: v.duplicate_attr
65 GUMBO_ERR_SOLIDUS_EOF,
66 GUMBO_ERR_SOLIDUS_INVALID,
67 GUMBO_ERR_DASHES_OR_DOCTYPE,
68 GUMBO_ERR_COMMENT_EOF,
69 GUMBO_ERR_COMMENT_INVALID,
70 GUMBO_ERR_COMMENT_BANG_AFTER_DOUBLE_DASH,
71 GUMBO_ERR_COMMENT_DASH_AFTER_DOUBLE_DASH,
72 GUMBO_ERR_COMMENT_SPACE_AFTER_DOUBLE_DASH,
73 GUMBO_ERR_COMMENT_END_BANG_EOF,
74 GUMBO_ERR_DOCTYPE_EOF,
75 GUMBO_ERR_DOCTYPE_INVALID,
76 GUMBO_ERR_DOCTYPE_SPACE,
77 GUMBO_ERR_DOCTYPE_RIGHT_BRACKET,
78 GUMBO_ERR_DOCTYPE_SPACE_OR_RIGHT_BRACKET,
79 GUMBO_ERR_DOCTYPE_END,
80 GUMBO_ERR_PARSER,  // payload: v.parser
81 GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG,  // payload: v.parser
82} GumboErrorType;
83
84// Additional data for duplicated attributes.
86 // The name of the attribute. Owned by this struct.
87 const char* name;
88
89 // The (0-based) index within the attributes vector of the original
90 // occurrence.
91 unsigned int original_index;
92
93 // The (0-based) index where the new occurrence would be.
94 unsigned int new_index;
96
97// A simplified representation of the tokenizer state, designed to be more
98// useful to clients of this library than the internal representation. This
99// condenses the actual states used in the tokenizer state machine into a few
100// values that will be familiar to users of HTML.
// A value of this type is recorded in the `state` field of the additional
// data kept for tokenizer errors. The enumerators have no explicit values, so
// their numeric values follow declaration order.
101typedef enum {
102 GUMBO_ERR_TOKENIZER_DATA,
103 GUMBO_ERR_TOKENIZER_CHAR_REF,
104 GUMBO_ERR_TOKENIZER_RCDATA,
105 GUMBO_ERR_TOKENIZER_RAWTEXT,
106 GUMBO_ERR_TOKENIZER_PLAINTEXT,
107 GUMBO_ERR_TOKENIZER_SCRIPT,
108 GUMBO_ERR_TOKENIZER_TAG,
109 GUMBO_ERR_TOKENIZER_SELF_CLOSING_TAG,
110 GUMBO_ERR_TOKENIZER_ATTR_NAME,
111 GUMBO_ERR_TOKENIZER_ATTR_VALUE,
112 GUMBO_ERR_TOKENIZER_MARKUP_DECLARATION,
113 GUMBO_ERR_TOKENIZER_COMMENT,
114 GUMBO_ERR_TOKENIZER_DOCTYPE,
115 GUMBO_ERR_TOKENIZER_CDATA,
116} GumboTokenizerErrorState;
117
118// Additional data for tokenizer errors.
119// This records the current state and codepoint encountered - this is usually
120// enough to reconstruct what went wrong and provide a friendly error message.
122 // The bad codepoint encountered.
123 int codepoint;
124
125 // The state that the tokenizer was in at the time.
126 GumboTokenizerErrorState state;
128
129// Additional data for parse errors.
131 // The type of input token that resulted in this error.
132 GumboTokenType input_type;
133
134 // The HTML tag of the input token. TAG_UNKNOWN if this was not a tag token.
135 GumboTag input_tag;
136
137 // The insertion mode that the parser was in at the time.
138 GumboInsertionMode parser_state;
139
140 // The tag stack at the point of the error. Note that this is a GumboVector
141 // of GumboTags *stored by value* - cast the void* to a GumboTag directly to
142 // get at the tag.
143 GumboVector /* GumboTag */ tag_stack;
145
146// The overall error struct representing an error in decoding/tokenizing/parsing
147// the HTML. This contains an enumerated type flag, a source position, and then
148// a union of fields containing data specific to the error.
149typedef struct GumboInternalError {
150 // The type of error.
151 GumboErrorType type;
152
153 // The position within the source file where the error occurred.
154 GumboSourcePosition position;
155
156 // A pointer to the byte within the original source file text where the error
157 // occurred (note that this is not the same as position.offset, as that gives
158 // character-based instead of byte-based offsets).
159 const char* original_text;
160
161 // Type-specific error information.
 // The lists below name, per member, the values of `type` that member is
 // documented for.
162 union {
163 // The code point we encountered, for:
164 // * GUMBO_ERR_UTF8_INVALID
165 // * GUMBO_ERR_UTF8_TRUNCATED
166 // * GUMBO_ERR_NUMERIC_CHAR_REF_WITHOUT_SEMICOLON
167 // * GUMBO_ERR_NUMERIC_CHAR_REF_INVALID
168 uint64_t codepoint;
169
170 // Tokenizer errors.
171 GumboTokenizerError tokenizer;
172
173 // Short textual data, for:
174 // * GUMBO_ERR_NAMED_CHAR_REF_WITHOUT_SEMICOLON
175 // * GUMBO_ERR_NAMED_CHAR_REF_INVALID
176 GumboStringPiece text;
177
178 // Duplicate attribute data, for GUMBO_ERR_DUPLICATE_ATTR.
179 GumboDuplicateAttrError duplicate_attr;
180
181 // Parser state, for GUMBO_ERR_PARSER and
182 // GUMBO_ERR_UNACKNOWLEDGED_SELF_CLOSING_TAG.
183 struct GumboInternalParserError parser;
184 } v;
185} GumboError;
186
187// Adds a new error to the parser's error list, and returns a pointer to it so
188// that clients can fill out the rest of its fields. May return NULL if we're
189// already over the max_errors field specified in GumboOptions.
// Because the result may be NULL, callers must check it before writing to the
// returned error.
190GumboError* gumbo_add_error(struct GumboInternalParser* parser);
191
192// Initializes the errors vector in the parser.
// `parser` is the parser whose errors vector is initialized.
193void gumbo_init_errors(struct GumboInternalParser* parser);
194
195// Frees all the errors in the 'errors_' field of the parser.
// `parser` is the parser whose 'errors_' field is cleaned up.
196void gumbo_destroy_errors(struct GumboInternalParser* parser);
197
198// Frees the memory used for a single GumboError.
// NOTE(review): `parser` presumably supplies the deallocator used to release
// `error` - confirm against the implementation.
199void gumbo_error_destroy(struct GumboInternalParser* parser, GumboError* error);
200
201// Prints an error to a string. This fills an empty GumboStringBuffer with a
202// freshly-allocated buffer containing the error message text. The caller is
203// responsible for deleting the buffer. (Note that the buffer is allocated with
204// the allocator specified in the GumboParser config and hence should be freed
205// by gumbo_parser_deallocate().)
// `error` is the error to describe; it is taken as const and is not modified.
// `output` must be an empty GumboStringBuffer and receives the message text.
206void gumbo_error_to_string(struct GumboInternalParser* parser,
207 const GumboError* error, GumboStringBuffer* output);
208
209// Prints a caret diagnostic to a string. This fills an empty GumboStringBuffer
210// with a freshly-allocated buffer containing the error message text. The
211// caller is responsible for deleting the buffer. (Note that the buffer is
212// allocated with the allocator specified in the GumboParser config and hence
213// should be freed by gumbo_parser_deallocate().)
// `error` is the error to describe; it is taken as const and is not modified.
// `source_text` is presumably the original document text that the error's
// position refers to - TODO confirm against the implementation.
// `output` must be an empty GumboStringBuffer and receives the diagnostic.
214void gumbo_caret_diagnostic_to_string(struct GumboInternalParser* parser,
215 const GumboError* error, const char* source_text,
216 GumboStringBuffer* output);
217
218// Like gumbo_caret_diagnostic_to_string, but prints the text to stdout instead
219// of writing to a string.
// Takes the same `parser`, `error` and `source_text` arguments as
// gumbo_caret_diagnostic_to_string; there is no output buffer for the caller
// to free.
220void gumbo_print_caret_diagnostic(struct GumboInternalParser* parser,
221 const GumboError* error, const char* source_text);
222
223#ifdef __cplusplus
224}
225#endif
226
227#endif // GUMBO_ERROR_H_
GumboTag
Definition: gumbo.h:158