donphan/app/javascript/glitch/util/bio_metadata.js

332 lines
11 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
`util/bio_metadata`
===================
> For more information on the contents of this file, please contact:
>
> - kibigo! [@kibi@glitch.social]
This file provides two functions for dealing with bio metadata. The
functions are:
- __`processBio(content)` :__
Processes `content` to extract any frontmatter. The returned
object has two properties: `text`, which contains the text of
`content` sans-frontmatter, and `metadata`, which is an array
of key-value pairs (in two-element array format). If no
frontmatter was provided in `content`, then `metadata` will be
an empty array.
- __`createBio(note, data)` :__
Reverses the process in `processBio()`; takes a `note` and an
array of two-element arrays (which should give keys and values)
and outputs a string containing a well-formed bio with
frontmatter.
*/
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
/*********************************************************************\
To my lovely code maintainers,
The syntax recognized by the Mastodon frontend for its bio metadata
feature is a subset of that provided by the YAML 1.2 specification.
In particular, Mastodon recognizes metadata which is provided as an
implicit YAML map, where each key-value pair takes up only a single
line (no multi-line values are permitted). To simplify the level of
processing required, Mastodon metadata frontmatter has been limited
to only allow those characters in the `c-printable` set, as defined
by the YAML 1.2 specification, instead of permitting those from the
`nb-json` characters inside double-quoted strings like YAML proper.
¶ It is important to note that Mastodon only borrows the *syntax*
of YAML, not its semantics. This is to say, Mastodon won't make any
attempt to interpret the data it receives. `true` will not become a
boolean; `56` will not be interpreted as a number. Rather, each key
and every value will be read as a string, and as a string they will
remain. The order of the pairs is unchanged, and any duplicate keys
are preserved. However, YAML escape sequences will be replaced with
the proper interpretations according to the YAML 1.2 specification.
¶ The implementation provided below interprets `<br>` as `\n` and
allows for an open <p> tag at the beginning of the bio. It replaces
the escaped character entities `&apos;` and `&quot;` with single or
double quotes, respectively, prior to processing. However, no other
escaped characters are replaced, not even those which might have an
impact on the syntax otherwise. These minor allowances are provided
because the Mastodon backend will insert these things automatically
into a bio before sending it through the API, so it is important we
account for them. Aside from this, the YAML frontmatter must be the
very first thing in the bio, leading with three consecutive hyphen-
minuses (`---`), and ending with the same or, alternatively, instead
with three periods (`...`). No limits have been set with respect to
the number of characters permitted in the frontmatter, although one
should note that only limited space is provided for them in the UI.
¶ The regular expression used to check the existence of, and then
process, the YAML frontmatter has been split into a number of small
components in the code below, in the vain hope that it will be much
easier to read and to maintain. I leave it to the future readers of
this code to determine the extent of my successes in this endeavor.
UPDATE 19 Oct 2017: We no longer allow character escapes inside our
double-quoted strings for ease of processing. We now internally use
the name "ƔAML" in our code to clarify that this is Not Quite YAML.
Sending love + warmth eternal,
- kibigo [@kibi@glitch.social]
\*********************************************************************/
/* "u" FLAG COMPATIBILITY */
// Assume the RegExp "u" (unicode) flag is missing until constructing a
// pattern with it succeeds; engines without the flag throw here.
let compat_mode = true;
try {
  new RegExp('.', 'u');
  compat_mode = false;
} catch (e) {
  // "u" flag unsupported: remain in compatibility mode.
}

/* CONVENIENCE FUNCTIONS */

// Compiles a pattern string, adding the "u" flag whenever it is available.
const unirex = (str) => {
  if (compat_mode) {
    return new RegExp(str);
  }
  return new RegExp(str, 'u');
};

// Wraps the source of a RegExp in a non-capturing group so that it can be
// concatenated into a larger pattern string as a single unit.
const rexstr = (exp) => `(?:${exp.source})`;
/* CHARACTER CLASSES */
// Every constant in this section is a RegExp fragment. Fragments are never
// used for matching directly here; they are spliced into larger pattern
// strings through rexstr(), which wraps each one in a non-capturing group.
// Zero-width anchors for the start and the end of the input.
const DOCUMENT_START = /^/;
const DOCUMENT_END = /$/;
// One permitted character. In compat mode (no "u" flag) the class is a single
// range up to U+FFFD and the astral range is omitted; otherwise the surrogate
// block is excluded and U+10000-U+10FFFF is added explicitly.
const ALLOWED_CHAR = unirex( // `c-printable` in the YAML 1.2 spec.
compat_mode ? '[\t\n\r\x20-\x7e\x85\xa0-\ufffd]' : '[\t\n\r\x20-\x7e\x85\xa0-\ud7ff\ue000-\ufffd\u{10000}-\u{10FFFF}]'
);
// A space or a tab.
const WHITE_SPACE = /[ \t]/;
// CRLF, LF, CR, or an HTML `<br>` tag (optionally written `<br/>` / `<br />`).
const LINE_BREAK = /\r?\n|\r|<br\s*\/?>/;
// Characters treated as YAML indicators.
const INDICATOR = /[-?:,[\]{}&#*!|>'"%@`]/;
// The flow-collection indicators: `,`, `[`, `]`, `{`, `}`.
const FLOW_CHAR = /[,[\]{}]/;
/* NEGATED CHARACTER CLASSES */
// Each of these is a negative lookahead for the corresponding class followed
// by `[^]` (any single character, line terminators included). The result
// consumes one character at a position where the class does NOT match.
const NOT_WHITE_SPACE = unirex('(?!' + rexstr(WHITE_SPACE) + ')[^]');
const NOT_LINE_BREAK = unirex('(?!' + rexstr(LINE_BREAK) + ')[^]');
const NOT_INDICATOR = unirex('(?!' + rexstr(INDICATOR) + ')[^]');
const NOT_FLOW_CHAR = unirex('(?!' + rexstr(FLOW_CHAR) + ')[^]');
const NOT_ALLOWED_CHAR = unirex(
'(?!' + rexstr(ALLOWED_CHAR) + ')[^]'
);
/* BASIC CONSTRUCTS */
// Zero or more spaces/tabs.
const ANY_WHITE_SPACE = unirex(rexstr(WHITE_SPACE) + '*');
// Zero or more permitted characters (the class includes `\n` and `\r`, so
// this can span several lines).
const ANY_ALLOWED_CHARS = unirex(rexstr(ALLOWED_CHAR) + '*');
// Optional trailing spaces/tabs followed by one line break.
const NEW_LINE = unirex(
rexstr(ANY_WHITE_SPACE) + rexstr(LINE_BREAK)
);
// One or more NEW_LINEs in a row.
const SOME_NEW_LINES = unirex(
'(?:' + rexstr(NEW_LINE) + ')+'
);
// Start of input, optionally followed by an opening `<p ...>` tag.
const POSSIBLE_STARTS = unirex(
rexstr(DOCUMENT_START) + rexstr(/<p[^<>]*>/) + '?'
);
// What may follow the closing marker: one or more new lines, the end of the
// input, or a closing `</p>` tag.
const POSSIBLE_ENDS = unirex(
rexstr(SOME_NEW_LINES) + '|' +
rexstr(DOCUMENT_END) + '|' +
rexstr(/<\/p>/)
);
// One character allowed inside a double-quoted string: anything except `"`
// that does not begin a line break.
const QUOTE_CHAR = unirex(
'(?=' + rexstr(NOT_LINE_BREAK) + ')[^"]'
);
const ANY_QUOTE_CHAR = unirex(
rexstr(QUOTE_CHAR) + '*'
);
// One unit allowed inside a single-quoted string: either a character other
// than `'`, or the doubled apostrophe `''`; in both cases the position must
// not begin a line break.
const ESCAPED_APOS = unirex(
'(?=' + rexstr(NOT_LINE_BREAK) + ')' + rexstr(/[^']|''/)
);
const ANY_ESCAPED_APOS = unirex(
rexstr(ESCAPED_APOS) + '*'
);
// First character of an unquoted key. The top-level `|` yields two
// alternatives:
//   1. a character that is not an indicator, not white space, and does not
//      begin a line break;
//   2. one of `?`, `:`, `-`, provided it is followed by a character that is
//      not white space, not a flow indicator, and does not begin a line break.
const FIRST_KEY_CHAR = unirex(
'(?=' + rexstr(NOT_LINE_BREAK) + ')' +
'(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
rexstr(NOT_INDICATOR) + '|' +
rexstr(/[?:-]/) +
'(?=' + rexstr(NOT_LINE_BREAK) + ')' +
'(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
'(?=' + rexstr(NOT_FLOW_CHAR) + ')'
);
// First character of an unquoted value: same shape as FIRST_KEY_CHAR, except
// that the second alternative does not reject a following flow indicator.
const FIRST_VALUE_CHAR = unirex(
'(?=' + rexstr(NOT_LINE_BREAK) + ')' +
'(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
rexstr(NOT_INDICATOR) + '|' +
rexstr(/[?:-]/) +
'(?=' + rexstr(NOT_LINE_BREAK) + ')' +
'(?=' + rexstr(NOT_WHITE_SPACE) + ')'
// Flow indicators are allowed in values.
);
// Any subsequent unit of an unquoted key. Three alternatives:
//   1. a space or tab;
//   2. a character other than `:` or `#` that is not a flow indicator, not
//      white space, and does not begin a line break, optionally followed by
//      `#` (so `#` is only consumed directly after such a character);
//   3. a `:` that is followed by a character other than a space or tab.
const LATER_KEY_CHAR = unirex(
rexstr(WHITE_SPACE) + '|' +
'(?=' + rexstr(NOT_LINE_BREAK) + ')' +
'(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
'(?=' + rexstr(NOT_FLOW_CHAR) + ')' +
rexstr(/[^:#]#?/) + '|' +
rexstr(/:/) + '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
);
// Any subsequent unit of an unquoted value: as LATER_KEY_CHAR, but without
// the flow-indicator restriction on the second alternative.
const LATER_VALUE_CHAR = unirex(
rexstr(WHITE_SPACE) + '|' +
'(?=' + rexstr(NOT_LINE_BREAK) + ')' +
'(?=' + rexstr(NOT_WHITE_SPACE) + ')' +
// Flow indicators are allowed in values.
rexstr(/[^:#]#?/) + '|' +
rexstr(/:/) + '(?=' + rexstr(NOT_WHITE_SPACE) + ')'
);
/* YAML CONSTRUCTS */

// Opening marker: optional spaces/tabs followed by three hyphen-minuses.
const ƔAML_START = unirex(
  rexstr(ANY_WHITE_SPACE) + '---'
);

// Closing marker: optional spaces/tabs followed by `---` or `...`.
// The backslashes are doubled on purpose: in a string literal a lone `\.`
// collapses to `.`, which the RegExp constructor would then read as "any
// character", letting any three characters close the frontmatter.
const ƔAML_END = unirex(
  rexstr(ANY_WHITE_SPACE) + '(?:---|\\.\\.\\.)'
);

// Zero-width pre-check: an opening marker, then only permitted characters,
// then a new line, a closing marker and one of the permitted endings.
const ƔAML_LOOKAHEAD = unirex(
  '(?=' +
    rexstr(ƔAML_START) +
    rexstr(ANY_ALLOWED_CHARS) + rexstr(NEW_LINE) +
    rexstr(ƔAML_END) + rexstr(POSSIBLE_ENDS) +
  ')'
);

// A double-quoted scalar; no escape sequences are recognised inside it.
const ƔAML_DOUBLE_QUOTE = unirex(
  '"' + rexstr(ANY_QUOTE_CHAR) + '"'
);

// A single-quoted scalar, in which `''` stands for one apostrophe.
const ƔAML_SINGLE_QUOTE = unirex(
  '\'' + rexstr(ANY_ESCAPED_APOS) + '\''
);

// Unquoted key / value: one "first" character followed by any number of
// "later" units.
const ƔAML_SIMPLE_KEY = unirex(
  rexstr(FIRST_KEY_CHAR) + rexstr(LATER_KEY_CHAR) + '*'
);
const ƔAML_SIMPLE_VALUE = unirex(
  rexstr(FIRST_VALUE_CHAR) + rexstr(LATER_VALUE_CHAR) + '*'
);

// A key or value in any of its three spellings; quoted forms are tried first.
const ƔAML_KEY = unirex(
  rexstr(ƔAML_DOUBLE_QUOTE) + '|' +
  rexstr(ƔAML_SINGLE_QUOTE) + '|' +
  rexstr(ƔAML_SIMPLE_KEY)
);
const ƔAML_VALUE = unirex(
  rexstr(ƔAML_DOUBLE_QUOTE) + '|' +
  rexstr(ƔAML_SINGLE_QUOTE) + '|' +
  rexstr(ƔAML_SIMPLE_VALUE)
);

// Key/value separator: a colon followed by at least one space or tab, with
// optional spaces/tabs on either side.
const ƔAML_SEPARATOR = unirex(
  rexstr(ANY_WHITE_SPACE) +
  ':' + rexstr(WHITE_SPACE) +
  rexstr(ANY_WHITE_SPACE)
);

// One `key: value` pair. Capture group 1 is the key and group 2 is the
// value, both still in their raw (possibly quoted) form.
const ƔAML_LINE = unirex(
  '(' + rexstr(ƔAML_KEY) + ')' +
  rexstr(ƔAML_SEPARATOR) +
  '(' + rexstr(ƔAML_VALUE) + ')'
);

/* FRONTMATTER REGEX */

// The complete frontmatter, anchored at the start of the input: optional
// `<p>` tag, opening marker, between zero and five `key: value` lines, the
// closing marker and one of the permitted endings.
const ƔAML_FRONTMATTER = unirex(
  rexstr(POSSIBLE_STARTS) +
  rexstr(ƔAML_LOOKAHEAD) +
  rexstr(ƔAML_START) + rexstr(SOME_NEW_LINES) +
  '(?:' +
    rexstr(ANY_WHITE_SPACE) + rexstr(ƔAML_LINE) + rexstr(SOME_NEW_LINES) +
  '){0,5}' +
  rexstr(ƔAML_END) + rexstr(POSSIBLE_ENDS)
);

/* SEARCHES */

// Locates a single `key: value` line: a new line, optional indentation, then
// the pair itself (whose two capture groups are preserved).
const FIND_ƔAML_LINE = unirex(
  rexstr(NEW_LINE) + rexstr(ANY_WHITE_SPACE) + rexstr(ƔAML_LINE)
);
/* STRING PROCESSING */
/**
 * Strips the surrounding quotes from a raw key or value.
 *
 * A string starting with `"` loses its first and last character. A string
 * starting with `'` does the same and additionally has every doubled
 * apostrophe (`''`) collapsed to a single one. Anything else is returned
 * unchanged.
 *
 * @param {string} str - The raw scalar as captured from the frontmatter.
 * @returns {string} The unquoted text.
 */
function processString (str) {
  const opener = str.charAt(0);
  if (opener !== '"' && opener !== '\'') {
    return str;
  }
  const inner = str.substring(1, str.length - 1);
  return opener === '"' ? inner : inner.replace(/''/g, '\'');
}
/* BIO PROCESSING */
/**
 * Splits a bio into its frontmatter metadata and the remaining text.
 *
 * `&quot;` and `&apos;` entities are decoded first. If the decoded bio does
 * not match the frontmatter pattern, it is returned whole with no metadata.
 *
 * @param {string} content - The bio as received.
 * @returns {{text: string, metadata: Array<[string, string]>}} The bio text
 *   with the frontmatter removed, plus the key/value pairs in source order.
 */
export function processBio(content) {
  const decoded = content.replace(/&quot;/g, '"').replace(/&apos;/g, '\'');
  const result = { text: decoded, metadata: [] };

  const found = decoded.match(ƔAML_FRONTMATTER);
  if (!found) return result;
  const frontmatter = found[0];

  // The text resumes right after the matched frontmatter.
  const start = decoded.search(ƔAML_START);
  const end = start + frontmatter.length - frontmatter.search(ƔAML_START);
  result.text = decoded.substr(end);

  // Some browsers don't allow flags unless both args are strings
  const lineFinder = new RegExp(rexstr(FIND_ƔAML_LINE), 'g');
  for (
    let hit = lineFinder.exec(frontmatter);
    hit;
    hit = lineFinder.exec(frontmatter)
  ) {
    const key = processString(hit[1]);
    const value = processString(hit[2]);
    result.metadata.push([key, value]);
  }
  return result;
}
/* BIO CREATION */
/**
 * Encodes one key or value for the frontmatter.
 *
 * The string is emitted as-is when the whole of it is a single match of
 * `simplePattern`, wrapped in double quotes when it consists solely of
 * characters permitted in a double-quoted string, and otherwise wrapped in
 * single quotes with apostrophes doubled and disallowed characters replaced.
 *
 * @param {string} str - The text to encode.
 * @param {RegExp} simplePattern - Pattern describing the unquoted form.
 * @returns {string} The encoded scalar.
 */
function encodeBioScalar(str, simplePattern) {
  if (str === (str.match(simplePattern) || [])[0]) return str;
  if (str === (str.match(ANY_QUOTE_CHAR) || [])[0]) return '"' + str + '"';
  const disallowed = new RegExp(rexstr(NOT_ALLOWED_CHAR), compat_mode ? 'g' : 'gu');
  // NOTE(review): the '<27>' placeholder looks like a mangled U+FFFD
  // replacement character; it is kept verbatim here, confirm against the
  // upstream file.
  const escaped = str.replace(/'/g, '\'\'').replace(disallowed, '<27>');
  return '\'' + escaped + '\'';
}

/**
 * Builds a bio string consisting of frontmatter followed by the note.
 *
 * Frontmatter is emitted when `data` is non-empty, or when the note itself
 * begins with `---` followed by white space (in which case an empty
 * frontmatter is put in front of it, presumably so that the note is not
 * itself read back as frontmatter).
 *
 * @param {string} [note] - The free-text part of the bio; falsy means ''.
 * @param {Array<Array>} [data] - Key/value pairs; each entry's elements 0 and
 *   1 are stringified and used as key and value.
 * @returns {string} The assembled bio.
 */
export function createBio(note, data) {
  if (!note) note = '';
  let frontmatter = '';
  if ((data && data.length) || note.match(/^\s*---\s+/)) {
    if (!data) frontmatter = '---\n...\n';
    else {
      frontmatter += '---\n';
      for (let i = 0; i < data.length; i++) {
        // Key and value go through the same encoder, each with its own
        // unquoted-form pattern, so the single-quote fallback is applied to
        // the string that actually needs it.
        const key = encodeBioScalar('' + data[i][0], ƔAML_SIMPLE_KEY);
        const val = encodeBioScalar('' + data[i][1], ƔAML_SIMPLE_VALUE);
        frontmatter += key + ': ' + val + '\n';
      }
      frontmatter += '...\n';
    }
  }
  return frontmatter + note;
}