MediaWiki:TextCleaner.js

Note: After publishing, you may have to bypass your browser's cache to see the changes.
Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (⌘-R on a Mac)
Google Chrome: Press Ctrl-Shift-R (⌘-Shift-R on a Mac)
Internet Explorer / Edge: Hold Ctrl while clicking Refresh, or press Ctrl-F5
Opera: Press Ctrl-F5.
/**
Wikitext sanitation for MediaWiki

Author: [[User:Lupo]], January 2008
License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)

Choose whichever license of these you like best :-)
*/
// <nowiki>
/* global mw:false, TextCleaner:true */
/* eslint no-control-regex:0, one-var:0, vars-on-top:0, camelcase:0, curly:0, space-in-parens:0, computed-property-spacing:0, array-bracket-spacing:0 */
/* jshint curly:false, eqnull:true, laxbreak:true */
(function () {
'use strict';
window.TextCleaner = {

	// Lazily filled list of all names (canonical, localized, aliases) of the image/file
	// namespace. Populated on the first call of sanitizeWikiText that needs it.
	imgNamespaceNames: null,

	/**
	 * Attempts to construct well-formed wikitext from input that may contain possibly broken
	 * wikitext.
	 *
	 * Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence
	 * of templates, and due to the fact that image thumbnail captions may themselves contain
	 * links. This implementation catches the most common errors (such as forgetting to close a
	 * template or a link), and even some more elaborate ones. With enough malice, this
	 * sanitation can still be broken by user input such that the result is not well-formed
	 * wikitext as the parser at the servers would like to have it. (It's still possible that
	 * the result is broken wikitext, if the input was broken wikitext. But it is intended never
	 * to transform well-formed wikitext into broken wikitext.)
	 *
	 * What is done, in short:
	 * - unclosed templates, tables, links, nowiki sections and HTML comments are closed;
	 * - spurious closing tokens (']', ']]', '}}', '|}') and top-level pipes are replaced by
	 *   numeric character references;
	 * - links nested where they are not allowed are moved out behind the enclosing link.
	 *
	 * WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It
	 * is probably rather inefficient due to the many substrings that are generated. This
	 * function is primarily intended to be used to clean up user input in forms, which are
	 * typically rather short.
	 *
	 * @param {string} input Wikitext to sanitize.
	 * @param {boolean} [only_thumbs] If true, all [[Image: links are changed to [[:Image:,
	 *   unless the original image link was a thumbnail or had a width of at most 300px
	 *   specified.
	 * @return {string} The sanitized wikitext; the input itself if it contains no critical
	 *   characters at all.
	 */
	sanitizeWikiText: function (input, only_thumbs) {
		// No critical characters: nothing to do. Case-insensitive like the scanner regexps
		// below, so that e.g. an unclosed <NOWIKI> is not skipped here.
		if (input.search(/[\][}{]|<nowiki(\s[^>]*)?>|<!--/i) < 0)
			return input;

		if (!TextCleaner.imgNamespaceNames) {
			TextCleaner.imgNamespaceNames = [];
			var namespaceIds = mw.config.get('wgNamespaceIds');
			if (namespaceIds) {
				for (var name in namespaceIds) {
					if (namespaceIds[name] === 6) // Image namespace
						TextCleaner.imgNamespaceNames.push(name);
				}
			}
			// Make sure that we have the two canonical names
			TextCleaner.imgNamespaceNames.push('Image');
			TextCleaner.imgNamespaceNames.push('File');
			// If your Wiki does not have wgNamespaceIds, add aliases or localized namespace names here!
		}

		// consumed[level - 1] is set by each nested sanitize() call running at caption level
		// 'level' to the number of input characters it has processed; the caller reads it right
		// after the call returns.
		// For image captions. Image caption may contain links, and may even contain images.
		// The current MediaWiki parser actually allows this only once. For deeper recursions,
		// it fails. But here, it's actually easier to implement no limit.
		var consumed = [0, 0];

		// Tokens of interest in normal mode: the one-character placeholders (see the bottom of
		// this function), single brackets, pipes, and the start of a nowiki or HTML comment.
		var base_regexp = new RegExp('[\\x01\\x02\\x03\\x04[\\]\\|\\x05\\x06\\x07\\x08]' +
			'|<nowiki(\\s[^>]*)?>|<!--',
		'i'); // Ignore case
		// Tokens of interest while inside a nowiki section.
		var nowiki_regexp = new RegExp('<\\/nowiki(\\s[^>]*)?>|<!--', 'i');
		var allow_only_thumbs = only_thumbs;

		// Builds the output for a link without a (usable) pipe: "s" up to the link opener,
		// the link target, and a closing token matching the opener. Image links are turned
		// into inline links ('[[:Image:...') if only thumbnails are allowed to be displayed.
		//   s        current text; the link target starts at index i
		//   i        index of the first character after the opening bracket(s)
		//   initial  the image namespace prefix including colon as found in s, or null
		//   lk_text  the already sanitized link target
		function plain_link(s, i, initial, lk_text) {
			if (initial && allow_only_thumbs) {
				// Should in any case have started with [[, not [
				return s.substring(0, i - 1) + '\x03:' + initial +
					lk_text.substring(initial.length) + '\x04';
			}
			return s.substring(0, i) + lk_text +
				((s.charAt(i - 1) === '[') ? ']' : '\x04');
		}

		// Recursive worker operating on placeholder-encoded text.
		//   s               text to process
		//   with_links      if false, stop at a link opener outside of templates/tables
		//   caption_level   nesting depth; 0 for the top-level call. If > 0, the number of
		//                   characters processed is reported through 'consumed'.
		//   allow_thumbs    whether image links may remain displayed thumbnails
		//   break_at_pipe   if true, stop at a pipe or table end outside of templates/tables
		//   with_tables     if false, stop at a table start
		//   with_galleries  if false, stop at a gallery start
		// Returns the sanitized prefix of s that was processed.
		function sanitize(s, with_links, caption_level, allow_thumbs, break_at_pipe, with_tables, with_galleries) {
			if (!s || !s.length) {
				if (caption_level > 0) {
					if (consumed.length < caption_level)
						consumed.push(0);
					else
						consumed[caption_level - 1] = 0;
				}
				return s;
			}

			var result = '';
			var ch = '';
			var initial_length = s.length;
			var get_out = false; // Set when we stop before the end of s; 'next' is then the stop index
			var in_nowiki = false;
			var endings = null; // Stack recording template, table, and gallery nesting
			var next;
			var regexp = base_regexp;

			function push_end(val) {
				if (!endings)
					endings = [val];
				else
					endings.push(val);
			}

			// Pops the innermost expected closing token; 'endings' is reset to null (not to an
			// empty array) when the stack runs empty, since the code tests it for truthiness.
			function pop_end() {
				if (!endings)
					return null; // Shouldn't happen
				var result;
				if (endings.length === 1) {
					result = endings[0];
					endings = null;
				} else {
					result = endings[endings.length - 1];
					endings.length--;
				}
				return result;
			}

			// Returns the image namespace prefix (including the colon, in the spelling used in
			// s) if s has one at index i, null otherwise. Comparison ignores case.
			function get_initial(i, s) {
				for (var j = 0; j < TextCleaner.imgNamespaceNames.length; j++) {
					if (s.length >= i + TextCleaner.imgNamespaceNames[j].length + 1) {
						var t = s.substr(i, TextCleaner.imgNamespaceNames[j].length + 1);
						if (t.toLowerCase() === (TextCleaner.imgNamespaceNames[j].toLowerCase() + ':'))
							return t;
					}
				}
				return null;
			}

			while (s.length > 0 && !get_out) {
				next = s.search(regexp);
				if (next < 0) {
					result += s;
					break;
				}
				ch = s.charAt(next);
				var i = -1;
				var j = -1;
				var k = -1;
				switch (ch) {
					case '<': {
						// Nowiki or HTML comment. Must be closed.
						// The regexps ignore case, so the tag name must be tested ignoring case, too.
						var tag_char = s.charAt(next + 1).toLowerCase();
						if (tag_char === '!') {
							// HTML comment. Cannot be nested.
							i = s.indexOf('-->', next + 3);
							if (i < 0) {
								result += s + '-->';
								s = '';
							} else {
								result += s.substring(0, i + 3);
								s = s.substring(i + 3);
							}
						} else if (tag_char === 'n') {
							// Opening nowiki tag; the regexp match guarantees that there is a '>'.
							i = s.indexOf('>', next + 1);
							if (s.charAt(i - 1) === '/') {
								// Self-closing <nowiki />: does not open a nowiki section.
								result += s.substring(0, i + 1);
								s = s.substring(i + 1);
							} else {
								// Nowiki may contain HTML comments!
								in_nowiki = true;
								regexp = nowiki_regexp;
								// 7 === '<nowiki'.length; the rest of the tag is copied as plain text.
								result += s.substring(0, next + 7);
								s = s.substring(next + 7);
							}
						} else {
							// End of nowiki. Searched for and found only if in_nowiki === true
							in_nowiki = false;
							regexp = base_regexp;
							i = s.indexOf('>', next + 1); // End of tag
							result += s.substring(0, i + 1);
							s = s.substring(i + 1);
						}
						break;
					}
					case '\x05':
						// Table start
						if (!with_tables) {
							result += s.substring(0, next);
							get_out = true;
							break;
						}
						/* fall through */
					case '\x07':
						if (ch === '\x07' && !with_galleries) {
							result += s.substring(0, next);
							get_out = true;
							break;
						}
						/* fall through */
					case '\x01':
						// Start of template, table, or gallery. The placeholder of the matching end
						// token is always the start placeholder's character code plus one.
						result += s.substring(0, next + 1);
						push_end(String.fromCharCode(ch.charCodeAt(0) + 1).charAt(0));
						s = s.substring(next + 1);
						break;
					case '\x06':
						// Table end
						if (break_at_pipe && !endings) {
							result += s.substring(0, next);
							get_out = true;
							break;
						}
						/* fall through */
					case '\x02':
						// End of a template or table
						result += s.substring(0, next);
						if (!endings || endings[endings.length - 1] !== ch) {
							// Spurious template or table end
							if (ch === '\x02')
								result += '&#x7D;&#x7D;';
							else
								result += '&#x7C;&#x7D;';
						} else {
							result += pop_end();
						}
						s = s.substring(next + 1);
						break;
					case '\x08':
						// End of gallery
						result += s.substring(0, next + 1);
						if (endings && endings[endings.length - 1] === ch)
							pop_end();
						s = s.substring(next + 1);
						break;
					case '\x03':
					case '[': {
						if (!with_links && !endings) {
							// Links not allowed here: stop in front of the link. Keep the text
							// before it, just like all other early exits do; the caller resumes
							// at the link opener.
							result += s.substring(0, next);
							get_out = true;
							break;
						}
						// Image links must be treated specially, since they may contain nested links
						// in the caption!
						var initial = null; // If set, it's 'image:' or 'file:' and we have an image link
						i = next;
						while (i < s.length && s.charAt(i) === ch)
							i++;
						if (ch === '\x03' && i < s.length && s.charAt(i) === '[')
							i++;
						initial = get_initial(i, s);

						// Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |}
						var lk_text = sanitize(s.substring(i),
							false, // No links at top-level allowed
							caption_level + 1,
							false, // No thumbs
							true, // Break at pipe
							false, // No tables
							false); // No galleries
						var lk_text_length = consumed[caption_level];
						j = i + lk_text_length;
						if (j >= s.length) {
							// Used up the whole text: [[Foo or [bar
							result += plain_link(s, i, initial, lk_text);
							s = '';
							break;
						}
						if (s.charAt(j) === '|')
							k = j;
						else
							k = -1;
						if (k < 0) {
							// No pipe found: we should be on the closing ]] or ] or [[Foo]] or [bar]
							result += plain_link(s, i, initial, lk_text);
							if (s.charAt(j) === ']' || s.charAt(j) === '\x04') {
								// Indeed closing the link
								s = s.substring(j + 1);
							} else {
								s = s.substring(j);
							}
							break;
						} else {
							var caption = null;
							var used = 0;
							// Pipe found.
							if (!initial) {
								// Not an image link. Must be something like [[Foo|Bar]].
								caption = sanitize(
									s.substring(k + 1),
									// No links, please
									false,
									caption_level + 1,
									// No thumbs either
									false,
									// Don't care about pipes
									false,
									// Allow tables (yes, parser allows that!)
									true,
									// Allow galleries (?)
									true);
								// Now we're at [[, [, ]], or ]
								used = consumed[caption_level];
								result += s.substring(0, i) + lk_text + '|' + caption +
									((s.charAt(i - 1) === '[') ? ']' : '\x04');
							} else {
								var q = s.substring(k);
								// We assume that there are no templates, nowikis, and other nasty things
								// in the parameters. Search forward until the next [, {, ], }
								var l = q.search(/[\x01\x02\x03[\x04\]{}\x05\x06\x07\x08]/);
								if (l < 0)
									l = q.length;
								if (l + 1 < q.length)
									q = q.substring(0, l + 1);
								var is_thumb = q.search(/\|\s*thumb(nail)?\s*[|\x04]/) >= 0;
								var img_width = /\|\s*(\d+)px\s*[|\x04]/.exec(q);
								if (img_width && img_width.length > 1) {
									img_width = parseInt(img_width[1], 10);
									if (isNaN(img_width))
										img_width = null;
								} else {
									img_width = null;
								}
								// No (usable) width given: assume 180px for thumbnails, and something
								// just above the 300px limit for everything else.
								if (!img_width)
									img_width = is_thumb ? 180 : 301;
								var is_small = img_width <= 300;

								// Caption starts at the last pipe before l. If that is a parameter,
								// it doesn't hurt.
								var m = k + q.lastIndexOf('|', l);
								caption = sanitize(
									s.substring(m + 1),
									// Allow links only if it's a thumb
									is_thumb,
									caption_level + 1,
									allow_thumbs && is_thumb,
									// Don't break at pipe
									false,
									// Tables only if it's a thumb
									is_thumb,
									// Allow galleries for thumbs (?)
									is_thumb);
								used = consumed[caption_level];
								// caption used 'used' chars from m+1, s.charAt(m+1+used) === '\x04'
								is_thumb = allow_thumbs && is_small;
								if (is_thumb || !allow_only_thumbs) {
									result += s.substring(0, i - 1) + '\x03' + lk_text;
								} else {
									result += s.substring(0, i - 1) + '\x03:' + initial +
										lk_text.substring(initial.length);
								}
								result += s.substring(k, m + 1) + caption + '\x04';
								k = m;
							}
							// Continue behind the caption, skipping the closing ]] if there is one.
							next = k + 1 + used;
							if (next < s.length) {
								if (s.charAt(next) !== '\x04')
									s = s.substring(next);
								else
									s = s.substring(next + 1);
							} else {
								s = '';
							}
						}
						break;
					}
					case '\x04':
					case ']':
						// Extra bracket.
						result += s.substring(0, next);
						if (!caption_level && !break_at_pipe) {
							result += (ch === ']' ? '&#x5D;' : '&#x5D;&#x5D;');
							s = s.substring(next + 1);
						} else {
							get_out = true;
						}
						break;
					case '|':
						result += s.substring(0, next);
						if (break_at_pipe && !endings) {
							// Pipe character at top level
							get_out = true;
						} else {
							if (!caption_level && !break_at_pipe && !endings)
								result += '&#x7C;'; // Top-level pipe character
							else
								result += '|';
							s = s.substring(next + 1);
						}
						break;
				} // end switch
			} // end while
			if (in_nowiki)
				result += '</nowiki>'; // Make sure this nowiki is closed.
			// Close open templates and tables
			while (endings) {
				ch = pop_end();
				result += (ch === '\x06' ? '\n' : '') + ch; // Table end must be on its own line
			}
			if (caption_level > 0) {
				// If we stopped early, s still starts where the last search began and 'next' is
				// the index of the token we stopped at: everything from there on is unprocessed.
				var used_up = initial_length - (get_out ? (s.length - next) : 0);
				if (consumed.length < caption_level)
					consumed.push(used_up);
				else
					consumed[caption_level - 1] = used_up;
			}
			return result;
		}

		// Replace multi-character tokens by one-character placeholders, simplifying the
		// subsequent processing.
		var s = input.replace(/\{\{/g, '\x01')
			.replace(/\n\s*\|\}\}\}/g, '\n\x06\x02') // Table end + template end
			.replace(/\}\}/g, '\x02')
			.replace(/\[\[/g, '\x03')
			.replace(/\]\]/g, '\x04')
			.replace(/\n\s*\{\|/g, '\n\x05') // Table start and end must be on own line
			.replace(/^\s*\{\|/, '\x05') // Table start at the very beginning
			.replace(/\n\s*\|\}/g, '\n\x06') // (we strip leading whitespace)
			.replace(/<\s*gallery\s*>/g, '\x07')
			.replace(/<\/\s*gallery\s*>/g, '\x08');

		// with links, allow thumbs, don't break at pipe, allow tables, allow galleries
		s = sanitize(s, true, 0, true, false, true, true);
		// Translate the placeholders back.
		return s.replace(/\x01/g, '{{')
			.replace(/\x02/g, '}}')
			.replace(/\x03/g, '[[')
			.replace(/\x04/g, ']]')
			.replace(/\x05/g, '{|')
			.replace(/\x06/g, '|}')
			.replace(/\x07/g, '<gallery>')
			.replace(/\x08/g, '</gallery>');
	}
};
}());
// </nowiki>