MediaWiki:TextCleaner.js

From Open Educational Resources
Jump to navigation Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

Note: After saving, you may have to bypass your browser's cache to see the changes.

  • Firefox / Safari: Hold Shift while clicking Reload, or press either Ctrl-F5 or Ctrl-R (⌘-R on a Mac)
  • Google Chrome: Press Ctrl-Shift-R (⌘-Shift-R on a Mac)
  • Internet Explorer / Edge: Hold Ctrl while clicking Refresh, or press Ctrl-F5
  • Opera: Go to Menu → Settings (Opera → Preferences on a Mac) and then to Privacy & security → Clear browsing data → Cached images and files.
// <source lang="javascript">
/*
  Wikitext sanitation for MediaWiki

  Author: [[User:Lupo]], January 2008
  License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)

  Choose whichever license of these you like best :-)
*/
/*jshint curly:false, eqnull:true, laxbreak:true */
var TextCleaner = {

  imgNamespaceNames : null,

  // This function attempts to construct well-formed wikitext from input that may contain
  // possibly broken wikitext.
  //
  // Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence
  // of templates, and due to the fact that image thumbnail captions may themselves contain
  // links. This implementation catches the most common errors (such as forgetting to close a
  // template or a link), and even some more elaborate ones. With enough malice, this sanitation
  // can still be broken by user input such that the result is not well-formed wikitext as the
  // parser at the servers would like to have it. (It's still possible that the result is broken
  // wikitext, if the input was broken wikitext. But it never transforms well-formed wikitext
  // into broken wikitext.)
  //
  // If 'only_thumbs' is true, all [[Image: links are changed to [[:Image:, unless the original
  // image link was a thumbnail or had a width smaller than 300px specified.
  //
  // WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It is
  // probably rather inefficient due to the many substrings that are generated. This function is
  // primarily intended to be used to clean up user input in forms, which are typically rather
  // short.
  sanitizeWikiText : function (input, only_thumbs) {
    if (input.search(/[\][}{]|<nowiki(\s[^>]*)?>|<\!--/) < 0) return input;
    // No critical characters

    if (!TextCleaner.imgNamespaceNames) {
      TextCleaner.imgNamespaceNames = [];
      var namespaceIds = mw.config.get('wgNamespaceIds');
      if (namespaceIds) {
        for (var name in namespaceIds) {
          if (namespaceIds[name] == 6) // Image namespace
            TextCleaner.imgNamespaceNames[TextCleaner.imgNamespaceNames.length] = name;
        }
      }
      // Make sure that we have the two canonical names
      TextCleaner.imgNamespaceNames[TextCleaner.imgNamespaceNames.length] = 'Image';
      TextCleaner.imgNamespaceNames[TextCleaner.imgNamespaceNames.length] = 'File';
      // If your Wiki does not have wgNamespaceIds, add aliases or localized namespace names here!
    }

    var consumed       = [0, 0];
    // For image captions. Image caption may contain links, and may even contain images.
    // The current MediaWiki parser actually allows this only once. For deeper recursions,
    // it fails. But here, it's actually easier to implement no limit.

    var base_regexp    =
      new RegExp
            (   "[\\x01\\x02\\x03\\x04[\\]\\|\\x05\\x06\\x07\\x08]"
              + "|\<nowiki(\\s[^>]*)?\>|\<\!--",
            "i"); // Ignore case
    var nowiki_regexp  = new RegExp("\<\\/nowiki(\\s[^>]*)?\>|\<\!--", "i");

    var allow_only_thumbs = only_thumbs;

    function sanitize
      (s, with_links, caption_level, allow_thumbs, break_at_pipe, with_tables, with_galleries) {
      if (!s || s.length === 0) {
        if (caption_level > 0) {
          if (consumed.length < caption_level)
            consumed.push(0);
          else
            consumed[caption_level-1] = 0;
        }
        return s;
      }

      var result         = "";
      var initial_length = s.length;
      var get_out        = false;
      var in_nowiki      = false;
      var endings        = null;
      // Stack recording template and table nesting
      var next;

      function push_end (val) {
        if (endings == null) {
          endings = new Array (1);
          endings[0] = val;
        } else {
          endings[endings.length] = val;
        }
      }

      function pop_end () {
        if (endings == null) return null; // Shouldn't happen
        var result;
        if (endings.length == 1) {
          result = endings[0];
          endings = null;
        } else {
          result = endings[endings.length -1];
          endings.length = endings.length - 1;
        }
        return result;
      }

      regexp = base_regexp;
      while (s.length > 0 && !get_out) {
        next = s.search(regexp);

        if (next < 0) {
          result = result + s;
          break;
        }
        var ch = s.charAt(next);
        var i  = -1;
        var j  = -1;
        var k  = -1;
        switch (ch) {
          case '<':
            // Nowiki or HTML comment. Must be closed.
            if (s.charAt(next+1) == '!') {
              // HTML comment. Cannot be nested.
              i = s.indexOf('--\>', next + 3);
              if (i < 0) {
                result = result + s + '--\>';
                s = "";
              } else {
                result = result + s.substring(0, i + 3);
                s = s.substring(i + 3);
              }
            } else if (s.charAt(next + 1) == 'n') {
              // Nowiki may contain HTML comments!
              in_nowiki = true;
              regexp = nowiki_regexp;
              result = result + s.substring(0, next + 7);
              s = s.substring(next + 7);
            } else {
              // End of nowiki. Searched for and found only if in_nowiki == true
              in_nowiki = false;
              regexp = base_regexp;
              i = s.indexOf('>', next+1); // End of tag
              result = result + s.substring(0, i+1);
              s = s.substring(i+1);
            }
            break;
          case '\x05':
            // Table start
            if (!with_tables) {
              result  = result + s.substring(0, next);
              get_out = true;
              break;
            }
            // Fall through
          case '\x07':
            if (ch == '\x07' && !with_galleries) {
              result = result + s.substring(0, next);
              get_out = true;
              break;
            }
          case '\x01':
            // Start of template, table, or gallery
            result = result + s.substring(0, next+1);
            push_end (String.fromCharCode(ch.charCodeAt(0)+1).charAt(0));
            s = s.substring(next+1);
            break;
          case '\x06':
            // Table end
            if (break_at_pipe && endings == null) {
              result = result + s.substring(0, next);
              get_out = true;
              break;
            }
            // Fall through
          case '\x02':
            // End of a template or table
            result = result + s.substring(0, next);
            if (endings == null || endings[endings.length - 1] != ch) {
              // Spurious template or table end
              if (ch == '\x02')
                result = result + '&#x7D;&#x7D;';
              else
                result = result + '&#x7C;&#x7D;';
            } else {
              result = result + pop_end ();
            }
            s = s.substring(next+1);
            break;
          case '\x08':
            // End of gallery
            result = result + s.substring(0, next+1);
            if (endings != null && endings[endings.length - 1] == ch) pop_end ();
            s = s.substring(next+1);
            break;
          case '\x03':
          case '[':
            {
              if (!with_links && endings == null) {
                get_out = true;
                break;
              }
              // Image links must be treated specially, since they may contain nested links
              // in the caption!
              var initial = null;  // If set, it's 'image:' or 'file:' and we have an image link
              i = next;
              while (i < s.length && s.charAt(i) == ch) i++;
              if (ch == '\x03' && i < s.length && s.charAt(i) == '[') i++;
              function get_initial(i, s) {
                for (var j = 0; j < TextCleaner.imgNamespaceNames.length; j++) {
                  if (s.length >= i + TextCleaner.imgNamespaceNames[j].length + 1) {
                    var t = s.substr(i, TextCleaner.imgNamespaceNames[j].length + 1);
                    if (t.toLowerCase() == (TextCleaner.imgNamespaceNames[j].toLowerCase() + ':'))
                      return t;
                  }
                }
                return null;
              }
              initial = get_initial (i, s);

              // Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |}
              var lk_text = sanitize (s.substring(i),
                                      false,           // No links at top-level allowed
                                      caption_level + 1,
                                      false,           // No thumbs
                                      true,            // Break at pipe
                                      false,           // No tables
                                      false);          // No galleries
              var lk_text_length = consumed[caption_level];
              j = i + lk_text_length;
              if (j >= s.length) {
                // Used up the whole text: [[Foo or [bar
                if (initial != null && allow_only_thumbs)
                  // Should in any case have started with [[, not [
                  result = result + s.substring(0, i-1) + '\x03:' + initial
                         + lk_text.substring(initial.length) + '\x04';
                else
                  result = result + s.substring(0, i) + lk_text
                         + ((s.charAt(i-1) == '[') ? ']' : '\x04');
                s = "";
                break;
              }
              if (s.charAt(j) == '|') k = j; else k = -1;
              if (k < 0) {
                // No pipe found: we should be on the closing ]] or ] or [[Foo]] or [bar]
                if (initial != null && allow_only_thumbs)
                  // Should in any case have started with [[, not [
                  result = result + s.substring(0, i-1) + '\x03:' + initial
                         + lk_text.substring(initial.length) + '\x04';
                else
                  result = result + s.substring(0, i) + lk_text
                         + ((s.charAt(i-1) == '[') ? ']' : '\x04');
                if (s.charAt(j) == ']' || s.charAt(j) == '\x04') {
                  // Indeed closing the link
                  s = s.substring(j+1);
                } else {
                  s = s.substring(j);
                }
                break;
              } else {
                var caption = null;
                var used    = 0;
                // Pipe found.
                if (initial == null) {
                  // Not an image link. Must be something like [[Foo|Bar]].
                  caption = sanitize(
                    s.substring(k + 1),
                    // No links, please
                    false,
                    caption_level + 1,
                    // No thumbs either
                    false,
                    // Don't care about pipes
                    false,
                    // Allow tables (yes, parser allows that!)
                    true,
                    // Allow galleries (?)
                    true
                  );
                  // Now we're at [[, [, ]], or ]
                  used = consumed[caption_level];
                  result = result + s.substring(0, i) + lk_text + '|' + caption
                         + ((s.charAt(i-1) == '[') ? ']' : '\x04');
                } else {
                  var q = s.substring(k);
                  // We assume that there are no templates, nowikis, and other nasty things
                  // in the parameters. Search forward until the next [, {, ], }
                  l = q.search(/[\x01\x02\x03[\x04\]\{\}\x05\x06\x07\x08]/);
                  if (l < 0) l = q.length;
                  if (l+1 < q.length) q = q.substring(0, l+1);
                  var is_thumb = q.search(/\|\s*thumb(nail)?\s*[\|\x04]/) >= 0;
                  var img_width = /\|\s*(\d+)px\s*[\|\x04]/.exec(q);
                  if (img_width && img_width.length > 1) {
                    img_width = parseInt (img_width[1], 10);
                    if (isNaN (img_width)) img_width = null;
                  } else
                    img_width = null;
                  if (img_width === null) img_width = is_thumb ? 180 : 301;
                  var is_small = img_width <= 300;

                  // Caption starts at the last pipe before l. If that is a parameter,
                  // it doesn't hurt.
                  var m = k + q.lastIndexOf('|', l);
                  caption = sanitize(
                    s.substring(m+1),
                    // Allow links only if it's a thumb
                    is_thumb,
                    caption_level + 1,
                    allow_thumbs && is_thumb,
                    // Don't break at pipe
                    false,
                    // Tables only if it's a thumb
                    is_thumb,
                    // Allow galleries for thumbs (?)
                    is_thumb
                  );
                  used = consumed[caption_level];
                  // caption used 'used' chars from m+1, s.charAt(m+1+used) == '\x04'
                  is_thumb = allow_thumbs && is_small;
                  if (is_thumb || !allow_only_thumbs)
                    result = result + s.substring(0, i-1) + '\x03' + lk_text ;
                  else
                    result = result + s.substring(0, i-1) + '\x03:' + initial
                           + lk_text.substring(initial.length);
                  result = result + s.substring(k, m+1) + caption + '\x04';
                  k = m;
                }
                next = k+1+used;
                if (next < s.length) {
                  if (s.charAt(next) != '\x04')
                    s = s.substring(next);
                  else
                    s = s.substring(next+1);
                } else
                  s = "";
              }
              break;
            }
          case '\x04':
          case ']':
            // Extra bracket.
            result = result + s.substring(0, next);
            if (caption_level === 0 && !break_at_pipe) {
              result = result + (ch == ']' ? '&#x5D;' : '&#x5D;&#x5D;');
              s = s.substring(next+1);
            } else
              get_out = true;
            break;
          case '|':
            result = result + s.substring(0, next);
            if (break_at_pipe && endings == null) {
              // Pipe character at top level
              get_out = true;
            } else {
              if (caption_level === 0 && !break_at_pipe && endings == null)
                result = result + '&#x7C;'; // Top-level pipe character
              else
                result = result + '|';
              s = s.substring(next+1);
            }
            break;
        } // end switch
      } // end while
      if (in_nowiki) result = result + "\<\/nowiki>"; // Make sure this nowiki is closed.
      // Close open templates and tables
      while (endings != null) {
        ch = pop_end();
        result = result + (ch == '\x06' ? '\n' : "") + ch;
      }
      if (caption_level > 0) {
        var used_up = initial_length - (get_out ? (s.length - next) : 0);
        if (consumed.length < caption_level)
          consumed[consumed.length] = used_up;
        else
          consumed[caption_level-1] = used_up;
      }
      return result;
    }

    // Replace multi-character tokens by one-character placeholders, simplifying the
    // subsequent processing.
    var s = input.replace(/\{\{/g, '\x01')
                 .replace(/\n\s*\|\}\}\}/g, '\n\x06\x02') // Table end + template end
                 .replace(/\}\}/g, '\x02')
                 .replace(/\[\[/g, '\x03')
                 .replace(/\]\]/g, '\x04')
                 .replace(/\n\s*\{\|/g, '\n\x05')       // Table start and end must be on own line
                 .replace(/^\s*\{\|/, '\x05')           // Table start at the very beginning
                 .replace(/\n\s*\|\}/g, '\n\x06')       // (we strip leading whitespace)
                 .replace(/\<\s*gallery\s*\>/g, '\x07')
                 .replace(/\<\/\s*gallery\s*\>/g, '\x08');

    s = sanitize(s, true, 0, true, false, true, true);
    // with links, allow thumbs, don't break at pipe, allow tables, allow galleries
    return s.replace(/\x01/g, '\{\{')
            .replace(/\x02/g, '\}\}')
            .replace(/\x03/g, '\[\[')
            .replace(/\x04/g, '\]\]')
            .replace(/\x05/g, '\{\|')
            .replace(/\x06/g, '\|\}')
            .replace(/\x07/g, '<gallery>')
            .replace(/\x08/g, '</gallery>');
  }
};

// </source>