MediaWiki:TextCleaner.js
Jump to navigation
Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
Note: After saving, you may have to bypass your browser's cache to see the changes.
- Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (⌘-R on a Mac)
- Google Chrome: Press Ctrl-Shift-R (⌘-Shift-R on a Mac)
- Internet Explorer / Edge: Hold Ctrl while clicking Refresh, or press Ctrl-F5
- Opera: Go to Menu → Settings (Opera → Preferences on a Mac) and then to Privacy & security → Clear browsing data → Cached images and files.
// <source lang="javascript">
/*
Wikitext sanitation for MediaWiki
Author: [[User:Lupo]], January 2008
License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)
Choose whichever license of these you like best :-)
*/
/*jshint curly:false, eqnull:true, laxbreak:true */
var TextCleaner = {

  // Lazily built list of all known names of the image/file namespace (id 6). It is filled on
  // the first call of sanitizeWikiText that actually has to process something, and always
  // contains at least 'Image' and 'File'.
  imgNamespaceNames : null,

  // This function attempts to construct well-formed wikitext from input that may contain
  // possibly broken wikitext.
  //
  // Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence
  // of templates, and due to the fact that image thumbnail captions may themselves contain
  // links. This implementation catches the most common errors (such as forgetting to close a
  // template or a link), and even some more elaborate ones. With enough malice, this sanitation
  // can still be broken by user input such that the result is not well-formed wikitext as the
  // parser at the servers would like to have it.
  //
  // Parameters:
  //   input        The wikitext to clean up. Empty or missing input is returned unchanged.
  //   only_thumbs  If true, all [[Image: links are changed to [[:Image:, unless the original
  //                image link was a thumbnail or had a width of at most 300px specified.
  //
  // Returns the sanitized text: unclosed templates, tables, links, HTML comments and <nowiki>
  // tags are closed. Input without any critical characters is returned as is.
  //
  // WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It is
  // probably rather inefficient due to the many substrings that are generated. This function is
  // primarily intended to be used to clean up user input in forms, which are typically rather
  // short.
  sanitizeWikiText : function (input, only_thumbs) {
    // Nothing to do for empty input or if there are no critical characters. The check is
    // case-insensitive so that it agrees with the (case-insensitive) scanner regexps below.
    if (!input || input.search(/[\][}{]|<nowiki(\s[^>]*)?>|<\!--/i) < 0) return input;

    if (!TextCleaner.imgNamespaceNames) {
      TextCleaner.imgNamespaceNames = [];
      // 'mw' only exists inside MediaWiki; do not fail if this runs elsewhere.
      var namespaceIds =
        (typeof mw != 'undefined' && mw && mw.config) ? mw.config.get('wgNamespaceIds') : null;
      if (namespaceIds) {
        for (var name in namespaceIds) {
          if (namespaceIds[name] == 6) // Image namespace
            TextCleaner.imgNamespaceNames.push(name);
        }
      }
      // Make sure that we have the two canonical names
      TextCleaner.imgNamespaceNames.push('Image');
      TextCleaner.imgNamespaceNames.push('File');
      // If your Wiki does not have wgNamespaceIds, add aliases or localized namespace names here!
    }

    // consumed[level] is the number of input characters the most recent nested call of
    // sanitize() with caption_level == level + 1 has used up. It is how a nested call reports
    // back to its caller where it stopped.
    // For image captions. Image caption may contain links, and may even contain images.
    // The current MediaWiki parser actually allows this only once. For deeper recursions,
    // it fails. But here, it's actually easier to implement no limit.
    var consumed = [0, 0];

    // Matches any placeholder, single bracket or pipe, an opening <nowiki>, or a comment start.
    var base_regexp = /[\x01\x02\x03\x04[\]|\x05\x06\x07\x08]|<nowiki(\s[^>]*)?>|<!--/i;
    // Used while inside <nowiki>: only the closing tag or a comment start are of interest.
    var nowiki_regexp = /<\/nowiki(\s[^>]*)?>|<!--/i;
    var allow_only_thumbs = only_thumbs;

    // If 'text' has an image namespace prefix (e.g. 'Image:') at position 'pos', return that
    // prefix exactly as it is written in the text (including the colon), otherwise null.
    function get_initial (pos, text) {
      var names = TextCleaner.imgNamespaceNames;
      for (var n = 0; n < names.length; n++) {
        if (text.length >= pos + names[n].length + 1) {
          var t = text.substr(pos, names[n].length + 1);
          if (t.toLowerCase() == (names[n].toLowerCase() + ':')) return t;
        }
      }
      return null;
    }

    // Sanitizes 's', which must already use the one-character placeholders. The flags say
    // what is allowed at the top level of 's'; when something not allowed is met, scanning
    // stops there. For caption_level > 0 the number of characters used is stored in
    // 'consumed' so that the caller can continue behind them.
    function sanitize
      (s, with_links, caption_level, allow_thumbs, break_at_pipe, with_tables, with_galleries) {
      if (!s || s.length === 0) {
        if (caption_level > 0) {
          if (consumed.length < caption_level)
            consumed.push(0);
          else
            consumed[caption_level-1] = 0;
        }
        return s;
      }
      var result         = "";
      var initial_length = s.length;
      var get_out        = false;
      var in_nowiki      = false;
      var endings        = []; // Stack recording template, table, and gallery nesting
      var regexp         = base_regexp;
      var next;
      var ch;

      while (s.length > 0 && !get_out) {
        next = s.search(regexp);
        if (next < 0) {
          result = result + s;
          break;
        }
        ch = s.charAt(next);
        var i = -1;
        var j = -1;
        var k = -1;
        switch (ch) {
          case '<':
            // Nowiki or HTML comment. Must be closed.
            if (s.charAt(next+1) == '!') {
              // HTML comment. Cannot be nested.
              i = s.indexOf('-->', next + 3);
              if (i < 0) {
                result = result + s + '-->';
                s = "";
              } else {
                result = result + s.substring(0, i + 3);
                s = s.substring(i + 3);
              }
            } else if (s.charAt(next + 1).toLowerCase() == 'n') {
              // Opening nowiki in any letter case (the regexps ignore case, so must we).
              // Nowiki may contain HTML comments!
              in_nowiki = true;
              regexp = nowiki_regexp;
              result = result + s.substring(0, next + 7);
              s = s.substring(next + 7);
            } else {
              // End of nowiki. Searched for and found only if in_nowiki == true
              in_nowiki = false;
              regexp = base_regexp;
              i = s.indexOf('>', next+1); // End of tag
              result = result + s.substring(0, i+1);
              s = s.substring(i+1);
            }
            break;
          case '\x05':
            // Table start
            if (!with_tables) {
              result = result + s.substring(0, next);
              get_out = true;
              break;
            }
            // Fall through
          case '\x07':
            if (ch == '\x07' && !with_galleries) {
              result = result + s.substring(0, next);
              get_out = true;
              break;
            }
            // Fall through
          case '\x01':
            // Start of template, table, or gallery. The matching end placeholder is always
            // the next character code.
            result = result + s.substring(0, next+1);
            endings.push(String.fromCharCode(ch.charCodeAt(0) + 1));
            s = s.substring(next+1);
            break;
          case '\x06':
            // Table end
            if (break_at_pipe && endings.length === 0) {
              result = result + s.substring(0, next);
              get_out = true;
              break;
            }
            // Fall through
          case '\x02':
            // End of a template or table
            result = result + s.substring(0, next);
            if (endings.length === 0 || endings[endings.length - 1] != ch) {
              // Spurious template or table end
              if (ch == '\x02')
                result = result + '}}';
              else
                result = result + '|}';
            } else {
              result = result + endings.pop();
            }
            s = s.substring(next+1);
            break;
          case '\x08':
            // End of gallery
            result = result + s.substring(0, next+1);
            if (endings.length > 0 && endings[endings.length - 1] == ch) endings.pop();
            s = s.substring(next+1);
            break;
          case '\x03':
          case '[':
            {
              if (!with_links && endings.length === 0) {
                // Links are not allowed here: stop in front of the bracket, but keep the text
                // scanned so far (it is counted as consumed below).
                result = result + s.substring(0, next);
                get_out = true;
                break;
              }
              // Image links must be treated specially, since they may contain nested links
              // in the caption!
              i = next;
              while (i < s.length && s.charAt(i) == ch) i++;
              if (ch == '\x03' && i < s.length && s.charAt(i) == '[') i++;
              // If set, it's 'image:' or 'file:' and we have an image link
              var initial = get_initial (i, s);
              // Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |}
              var lk_text = sanitize (s.substring(i),
                                      false,              // No links at top-level allowed
                                      caption_level + 1,
                                      false,              // No thumbs
                                      true,               // Break at pipe
                                      false,              // No tables
                                      false);             // No galleries
              var lk_text_length = consumed[caption_level];
              j = i + lk_text_length;
              if (j >= s.length) {
                // Used up the whole text: [[Foo or [bar
                if (initial != null && allow_only_thumbs)
                  // Should in any case have started with [[, not [
                  result = result + s.substring(0, i-1) + '\x03:' + initial
                         + lk_text.substring(initial.length) + '\x04';
                else
                  result = result + s.substring(0, i) + lk_text
                         + ((s.charAt(i-1) == '[') ? ']' : '\x04');
                s = "";
                break;
              }
              if (s.charAt(j) == '|') k = j; else k = -1;
              if (k < 0) {
                // No pipe found: we should be on the closing ]] or ] or [[Foo]] or [bar]
                if (initial != null && allow_only_thumbs)
                  // Should in any case have started with [[, not [
                  result = result + s.substring(0, i-1) + '\x03:' + initial
                         + lk_text.substring(initial.length) + '\x04';
                else
                  result = result + s.substring(0, i) + lk_text
                         + ((s.charAt(i-1) == '[') ? ']' : '\x04');
                if (s.charAt(j) == ']' || s.charAt(j) == '\x04') {
                  // Indeed closing the link
                  s = s.substring(j+1);
                } else {
                  s = s.substring(j);
                }
                break;
              } else {
                var caption = null;
                var used    = 0;
                // Pipe found.
                if (initial == null) {
                  // Not an image link. Must be something like [[Foo|Bar]].
                  caption = sanitize (s.substring(k + 1),
                                      false,              // No links, please
                                      caption_level + 1,
                                      false,              // No thumbs either
                                      false,              // Don't care about pipes
                                      true,               // Allow tables (yes, parser allows that!)
                                      true);              // Allow galleries (?)
                  // Now we're at [[, [, ]], or ]
                  used = consumed[caption_level];
                  result = result + s.substring(0, i) + lk_text + '|' + caption
                         + ((s.charAt(i-1) == '[') ? ']' : '\x04');
                } else {
                  var q = s.substring(k);
                  // We assume that there are no templates, nowikis, and other nasty things
                  // in the parameters. Search forward until the next [, {, ], }
                  var l = q.search(/[\x01\x02\x03[\x04\]\{\}\x05\x06\x07\x08]/);
                  if (l < 0) l = q.length;
                  if (l+1 < q.length) q = q.substring(0, l+1);
                  var is_thumb  = q.search(/\|\s*thumb(nail)?\s*[\|\x04]/) >= 0;
                  var img_width = /\|\s*(\d+)px\s*[\|\x04]/.exec(q);
                  if (img_width && img_width.length > 1) {
                    img_width = parseInt (img_width[1], 10);
                    if (isNaN (img_width)) img_width = null;
                  } else
                    img_width = null;
                  // No explicit width: thumbs count as small (180), anything else as large.
                  if (img_width === null) img_width = is_thumb ? 180 : 301;
                  var is_small = img_width <= 300;
                  // Caption starts at the last pipe before l. If that is a parameter,
                  // it doesn't hurt.
                  var m = k + q.lastIndexOf('|', l);
                  caption = sanitize (s.substring(m+1),
                                      is_thumb,           // Allow links only if it's a thumb
                                      caption_level + 1,
                                      allow_thumbs && is_thumb,
                                      false,              // Don't break at pipe
                                      is_thumb,           // Tables only if it's a thumb
                                      is_thumb);          // Allow galleries for thumbs (?)
                  used = consumed[caption_level];
                  // caption used 'used' chars from m+1, s.charAt(m+1+used) == '\x04'
                  is_thumb = allow_thumbs && is_small;
                  if (is_thumb || !allow_only_thumbs)
                    result = result + s.substring(0, i-1) + '\x03' + lk_text;
                  else
                    result = result + s.substring(0, i-1) + '\x03:' + initial
                           + lk_text.substring(initial.length);
                  result = result + s.substring(k, m+1) + caption + '\x04';
                  k = m;
                }
                next = k+1+used;
                if (next < s.length) {
                  if (s.charAt(next) != '\x04')
                    s = s.substring(next);
                  else
                    s = s.substring(next+1);
                } else
                  s = "";
              }
              break;
            }
          case '\x04':
          case ']':
            // Extra bracket.
            result = result + s.substring(0, next);
            if (caption_level === 0 && !break_at_pipe) {
              result = result + (ch == ']' ? ']' : ']]');
              s = s.substring(next+1);
            } else
              get_out = true;
            break;
          case '|':
            result = result + s.substring(0, next);
            if (break_at_pipe && endings.length === 0) {
              // Pipe character at top level
              get_out = true;
            } else {
              result = result + '|';
              s = s.substring(next+1);
            }
            break;
        } // end switch
      } // end while
      if (in_nowiki) result = result + "</nowiki>"; // Make sure this nowiki is closed.
      // Close open templates, tables, and galleries. A table end must be on its own line.
      while (endings.length > 0) {
        ch = endings.pop();
        result = result + (ch == '\x06' ? '\n' : "") + ch;
      }
      if (caption_level > 0) {
        // If we stopped early, 'next' is the position in 's' where we stopped.
        var used_up = initial_length - (get_out ? (s.length - next) : 0);
        if (consumed.length < caption_level)
          consumed[consumed.length] = used_up;
        else
          consumed[caption_level-1] = used_up;
      }
      return result;
    }

    // Replace multi-character tokens by one-character placeholders, simplifying the
    // subsequent processing.
    var s = input.replace(/\{\{/g, '\x01')
                 .replace(/\n\s*\|\}\}\}/g, '\n\x06\x02') // Table end + template end
                 .replace(/\}\}/g, '\x02')
                 .replace(/\[\[/g, '\x03')
                 .replace(/\]\]/g, '\x04')
                 .replace(/\n\s*\{\|/g, '\n\x05')         // Table start and end must be on own line
                 .replace(/^\s*\{\|/, '\x05')             // Table start at the very beginning
                 .replace(/\n\s*\|\}/g, '\n\x06')         // (we strip leading whitespace)
                 .replace(/\<\s*gallery\s*\>/g, '\x07')
                 .replace(/\<\/\s*gallery\s*\>/g, '\x08');

    // with links, allow thumbs, don't break at pipe, allow tables, allow galleries
    s = sanitize (s, true, 0, true, false, true, true);

    // Translate the placeholders back into the real tokens.
    return s.replace(/\x01/g, '{{')
            .replace(/\x02/g, '}}')
            .replace(/\x03/g, '[[')
            .replace(/\x04/g, ']]')
            .replace(/\x05/g, '{|')
            .replace(/\x06/g, '|}')
            .replace(/\x07/g, '<gallery>')
            .replace(/\x08/g, '</gallery>');
  }

};
// </source>