summaryrefslogtreecommitdiffstats
path: root/depends/hoedown/src/escape.c
diff options
context:
space:
mode:
Diffstat (limited to 'depends/hoedown/src/escape.c')
-rw-r--r--depends/hoedown/src/escape.c188
1 files changed, 188 insertions, 0 deletions
diff --git a/depends/hoedown/src/escape.c b/depends/hoedown/src/escape.c
new file mode 100644
index 00000000..b4a275ba
--- /dev/null
+++ b/depends/hoedown/src/escape.c
@@ -0,0 +1,188 @@
+#include "hoedown/escape.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#define likely(x) __builtin_expect((x),1)
+#define unlikely(x) __builtin_expect((x),0)
+
+
+/*
+ * The following characters will not be escaped:
+ *
+ * -_.+!*'(),%#@?=;:/,+&$ alphanum
+ *
+ * Note that this character set is the addition of:
+ *
+ * - The characters which are safe to be in an URL
+ * - The characters which are *not* safe to be in
+ * an URL because they are RESERVED characters.
+ *
+ * We assume (lazily) that any RESERVED char that
+ * appears inside an URL is actually meant to
+ * have its native function (i.e. as an URL
+ * component/separator) and hence needs no escaping.
+ *
+ * There are two exceptions: the chacters & (amp)
+ * and ' (single quote) do not appear in the table.
+ * They are meant to appear in the URL as components,
+ * yet they require special HTML-entity escaping
+ * to generate valid HTML markup.
+ *
+ * All other characters will be escaped to %XX.
+ *
+ */
+static const uint8_t HREF_SAFE[UINT8_MAX+1] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+void
+hoedown_escape_href(hoedown_buffer *ob, const uint8_t *data, size_t size)
+{
+ static const char hex_chars[] = "0123456789ABCDEF";
+ size_t i = 0, mark;
+ char hex_str[3];
+
+ hex_str[0] = '%';
+
+ while (i < size) {
+ mark = i;
+ while (i < size && HREF_SAFE[data[i]]) i++;
+
+ /* Optimization for cases where there's nothing to escape */
+ if (mark == 0 && i >= size) {
+ hoedown_buffer_put(ob, data, size);
+ return;
+ }
+
+ if (likely(i > mark)) {
+ hoedown_buffer_put(ob, data + mark, i - mark);
+ }
+
+ /* escaping */
+ if (i >= size)
+ break;
+
+ switch (data[i]) {
+ /* amp appears all the time in URLs, but needs
+ * HTML-entity escaping to be inside an href */
+ case '&':
+ HOEDOWN_BUFPUTSL(ob, "&amp;");
+ break;
+
+ /* the single quote is a valid URL character
+ * according to the standard; it needs HTML
+ * entity escaping too */
+ case '\'':
+ HOEDOWN_BUFPUTSL(ob, "&#x27;");
+ break;
+
+ /* the space can be escaped to %20 or a plus
+ * sign. we're going with the generic escape
+ * for now. the plus thing is more commonly seen
+ * when building GET strings */
+#if 0
+ case ' ':
+ hoedown_buffer_putc(ob, '+');
+ break;
+#endif
+
+ /* every other character goes with a %XX escaping */
+ default:
+ hex_str[1] = hex_chars[(data[i] >> 4) & 0xF];
+ hex_str[2] = hex_chars[data[i] & 0xF];
+ hoedown_buffer_put(ob, (uint8_t *)hex_str, 3);
+ }
+
+ i++;
+ }
+}
+
+
+/**
+ * According to the OWASP rules:
+ *
+ * & --> &amp;
+ * < --> &lt;
+ * > --> &gt;
+ * " --> &quot;
+ * ' --> &#x27; &apos; is not recommended
+ * / --> &#x2F; forward slash is included as it helps end an HTML entity
+ *
+ */
+static const uint8_t HTML_ESCAPE_TABLE[UINT8_MAX+1] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 1, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 4,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 6, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+static const char *HTML_ESCAPES[] = {
+ "",
+ "&quot;",
+ "&amp;",
+ "&#39;",
+ "&#47;",
+ "&lt;",
+ "&gt;"
+};
+
+void
+hoedown_escape_html(hoedown_buffer *ob, const uint8_t *data, size_t size, int secure)
+{
+ size_t i = 0, mark;
+
+ while (1) {
+ mark = i;
+ while (i < size && HTML_ESCAPE_TABLE[data[i]] == 0) i++;
+
+ /* Optimization for cases where there's nothing to escape */
+ if (mark == 0 && i >= size) {
+ hoedown_buffer_put(ob, data, size);
+ return;
+ }
+
+ if (likely(i > mark))
+ hoedown_buffer_put(ob, data + mark, i - mark);
+
+ if (i >= size) break;
+
+ /* The forward slash is only escaped in secure mode */
+ if (!secure && data[i] == '/') {
+ hoedown_buffer_putc(ob, '/');
+ } else {
+ hoedown_buffer_puts(ob, HTML_ESCAPES[HTML_ESCAPE_TABLE[data[i]]]);
+ }
+
+ i++;
+ }
+}