File: rexml/text.rb

Overview
Module Structure
Class Hierarchy
Code

Overview

Module Structure

  module: <Toplevel Module>
  module: REXML#7
  class: Text#9
includes
  Comparable ( Builtin-Module )
inherits from
  Child ( REXML )
has properties
constant: SPECIALS #12
constant: SUBSTITUTES #13
constant: SLAICEPS #15
constant: SETUTITSBUS #16
attribute: raw [RW] #19
constant: ILLEGAL #21
constant: NUMERICENTITY #22
method: initialize / 6 #60
method: node_type #96
method: empty? #100
method: clone #105
method: << / 1 #112
method: <=> / 1 #119
constant: REFERENCE #123
method: to_s #137
method: inspect #150
method: value #167
method: value= / 1 #184
method: wrap / 3 #191
method: indent_text / 4 #202
method: write / 4 #217
method: xpath #229
method: write_with_substitution #249
class method: read_with_substitution / 2 #262
constant: EREFERENCE #286
class method: normalize / 3 #288
class method: unnormalize / 4 #310

Class Hierarchy

Object ( Builtin-Module )
Child ( REXML )
  Text    #9

Code

   1  require 'rexml/entity'
   2  require 'rexml/doctype'
   3  require 'rexml/child'
   4  require 'rexml/doctype'
   5  require 'rexml/parseexception'
   6 
   7  module REXML
   8    # Represents text nodes in an XML document
   9    class Text < Child
  10      include Comparable
    # Patterns replaced, in this order, by write_with_substitution.  The
    # first pattern matches an '&' that is not already the start of a
    # "&name;" or "&#name;" style reference.
    SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ]
    # Replacement text for SPECIALS; entries pair up by index.
    SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;']
    # Characters which are substituted in written strings.  These are the
    # literal characters that read_with_substitution produces for the
    # SETUTITSBUS pattern at the same index.
    SLAICEPS = [ '<', '>', '"', "'", '&' ]
    # Escaped forms recognised by read_with_substitution; entries pair up
    # with SLAICEPS by index ('&amp;' is deliberately last).
    SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ]

    # If +raw+ is true, then REXML leaves the value alone
    attr_accessor :raw

    # Default +illegal+ pattern of the constructor.  Matches a literal '<',
    # or an '&' that is followed neither by an Entity::NAME nor by a numeric
    # character reference terminated with ';'.
    # NOTE(review): the ';' belongs only to the numeric alternative of the
    # lookahead, so "&name" without a semicolon is not flagged -- confirm
    # that this is intended.
    ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um
    # Decimal ("&#60;") or hexadecimal ("&#x3C;") character reference; the
    # capture holds the digits without leading zeros.  Used by unnormalize.
    NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
  23 
  24      # Constructor
  25      # +arg+ if a String, the content is set to the String.  If a Text,
  26      # the object is shallowly cloned.  
  27      #
  28      # +respect_whitespace+ (boolean, false) if true, whitespace is
  29      # respected
  30      #
  31      # +parent+ (nil) if this is a Parent object, the parent
  32      # will be set to this.  
  33      #
  34      # +raw+ (nil) This argument can be given three values.
  35      # If true, then the value used to construct this object is expected to 
  36      # contain no unescaped XML markup, and REXML will not change the text. If 
  37      # this value is false, the string may contain any characters, and REXML will
  38      # escape any and all defined entities whose values are contained in the
  39      # text.  If this value is nil (the default), then the raw value of the 
  40      # parent will be used as the raw value for this node.  If there is no raw
  41      # value for the parent, and no value is supplied, the default is false.
  42      # Use this field if you have entities defined for some text, and you don't
  43      # want REXML to escape that text in output.
  44      #   Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
  45      #   Text.new( "&lt;&amp;", false, nil, false ) #-> "&amp;lt;&amp;amp;"
  46      #   Text.new( "<&", false, nil, true )  #-> Parse exception
  47      #   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
  48      #   # Assume that the entity "s" is defined to be "sean"
  49      #   # and that the entity    "r" is defined to be "russell"
  50      #   Text.new( "sean russell" )          #-> "&s; &r;"
  51      #   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
  52      #
  53      # +entity_filter+ (nil) This can be an array of entities to match in the
  54      # supplied text.  This argument is only useful if +raw+ is set to false.
  55      #   Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
  56      #   Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
  57      # In the last example, the +entity_filter+ argument is ignored.
  58      #
  59      # +illegal+ INTERNAL USE ONLY
  60      def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, 
  61        entity_filter=nil, illegal=ILLEGAL )
  62 
  63        @raw = false
  64 
  65        if parent
  66          super( parent )
  67          @raw = parent.raw 
  68        else
  69          @parent = nil
  70        end
  71 
  72        @raw = raw unless raw.nil?
  73        @entity_filter = entity_filter
  74        @normalized = @unnormalized = nil
  75 
  76        if arg.kind_of? String
  77          @string = arg.clone
  78          @string.squeeze!(" \n\t") unless respect_whitespace
  79        elsif arg.kind_of? Text
  80          @string = arg.to_s
  81          @raw = arg.raw
  82        elsif
  83          raise "Illegal argument of type #{arg.type} for Text constructor (#{arg})"
  84        end
  85 
  86        @string.gsub!( /\r\n?/, "\n" )
  87 
  88        # check for illegal characters
  89        if @raw
  90          if @string =~ illegal
  91            raise "Illegal character '#{$1}' in raw string \"#{@string}\""
  92          end
  93        end
  94      end
  95 
  96      def node_type
  97        :text
  98      end
  99 
 100      def empty?
 101        @string.size==0
 102      end
 103 
 104 
 105      def clone
 106        return Text.new(self)
 107      end
 108 
 109 
 110      # Appends text to this text node.  The text is appended in the +raw+ mode
 111      # of this text node.
 112      def <<( to_append )
 113        @string << to_append.gsub( /\r\n?/, "\n" )
 114      end
 115 
 116 
 117      # +other+ a String or a Text
 118      # +returns+ the result of (to_s <=> arg.to_s)
 119      def <=>( other )
 120        to_s() <=> other.to_s
 121      end
 122 
    # Regexp compiled from Entity::REFERENCE; unnormalize scans text with it
    # to find the entity references it has to resolve.
    REFERENCE = /#{Entity::REFERENCE}/
 124      # Returns the string value of this text node.  This string is always
 125      # escaped, meaning that it is a valid XML text node string, and all
 126      # entities that can be escaped, have been inserted.  This method respects
 127      # the entity filter set in the constructor.
 128      #   
 129      #   # Assume that the entity "s" is defined to be "sean", and that the 
 130      #   # entity "r" is defined to be "russell"
 131      #   t = Text.new( "< & sean russell", false, nil, false, ['s'] ) 
 132      #   t.to_s   #-> "&lt; &amp; &s; russell"
 133      #   t = Text.new( "< & &s; russell", false, nil, false ) 
 134      #   t.to_s   #-> "&lt; &amp; &s; russell"
 135      #   u = Text.new( "sean russell", false, nil, true )
 136      #   u.to_s   #-> "sean russell"
 137      def to_s
 138        return @string if @raw
 139        return @normalized if @normalized
 140 
 141        doctype = nil
 142        if @parent
 143          doc = @parent.document
 144          doctype = doc.doctype if doc
 145        end
 146 
 147        @normalized = Text::normalize( @string, doctype, @entity_filter )
 148      end
 149 
 150      def inspect
 151        @string.inspect
 152      end
 153 
 154      # Returns the string value of this text.  This is the text without
 155      # entities, as it might be used programmatically, or printed to the
 156      # console.  This ignores the 'raw' attribute setting, and any
 157      # entity_filter.
 158      #
 159      #   # Assume that the entity "s" is defined to be "sean", and that the 
 160      #   # entity "r" is defined to be "russell"
 161      #   t = Text.new( "< & sean russell", false, nil, false, ['s'] ) 
 162      #   t.value   #-> "< & sean russell"
 163      #   t = Text.new( "< & &s; russell", false, nil, false )
 164      #   t.value   #-> "< & sean russell"
 165      #   u = Text.new( "sean russell", false, nil, true )
 166      #   u.value   #-> "sean russell"
 167      def value
 168        @unnormalized if @unnormalized
 169        doctype = nil
 170        if @parent
 171          doc = @parent.document
 172          doctype = doc.doctype if doc
 173        end
 174        @unnormalized = Text::unnormalize( @string, doctype )
 175      end
 176 
 177      # Sets the contents of this text node.  This expects the text to be 
 178      # unnormalized.  As an assignment, the expression evaluates to +val+.
 179      #
 180      #   e = Element.new( "a" )
 181      #   e.add_text( "foo" )   # <a>foo</a>
 182      #   e[0].value = "bar"    # <a>bar</a>
 183      #   e[0].value = "<a>"    # <a>&lt;a&gt;</a>
 184      def value=( val )
 185        @string = val.gsub( /\r\n?/, "\n" )
 186        @unnormalized = nil
 187        @normalized = nil
 188        @raw = false
 189      end
 190   
 191       def wrap(string, width, addnewline=false)
 192         # Recursively wrap string at width.
 193         return string if string.length <= width
 194         place = string.rindex(' ', width) # Position in string with last ' ' before cutoff
 195         if addnewline then
 196           return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width)
 197         else
 198           return string[0,place] + "\n" + wrap(string[place+1..-1], width)
 199         end
 200       end
 201 
 202      def indent_text(string, level=1, style="\t", indentfirstline=true)
 203        return string if level < 0
 204        new_string = ''
 205        string.each { |line|
 206          indent_string = style * level
 207          new_line = (indent_string + line).sub(/[\s]+$/,'')
 208          new_string << new_line
 209        }
 210        new_string.strip! unless indentfirstline
 211        return new_string
 212      end
 213   
 214      # == DEPRECATED
 215      # See REXML::Formatters
 216      #
 217      def write( writer, indent=-1, transitive=false, ie_hack=false ) 
 218        Kernel.warn("#{self.class.name}.write is deprecated.  See REXML::Formatters")
 219        formatter = if indent > -1
 220            REXML::Formatters::Pretty.new( indent )
 221          else
 222            REXML::Formatters::Default.new
 223          end
 224        formatter.write( self, writer )
 225      end
 226 
 227      # FIXME
 228      # This probably won't work properly
 229      def xpath
 230        path = @parent.xpath
 231        path += "/text()"
 232        return path
 233      end
 234 
 235      # Writes out text, substituting special characters beforehand.
 236      # +out+ A String, IO, or any other object supporting <<( String )
 237      # +input+ the text to substitute and then write out
 238      #
 239      #   z=utf8.unpack("U*")
 240      #   ascOut=""
 241      #   z.each{|r|
 242      #     if r <  0x100
 243      #       ascOut.concat(r.chr)
 244      #     else
 245      #       ascOut.concat(sprintf("&#x%x;", r))
 246      #     end
 247      #   }
 248      #   puts ascOut
 249      def write_with_substitution out, input
 250        copy = input.clone
 251        # Doing it like this rather than in a loop improves the speed
 252        copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
 253        copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
 254        copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
 255        copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
 256        copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
 257        copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
 258        out << copy
 259      end
 260 
 261      # Reads text, substituting entities
 262      def Text::read_with_substitution( input, illegal=nil )
 263        copy = input.clone
 264 
 265        if copy =~ illegal
 266          raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
 267        end if illegal
 268        
 269        copy.gsub!( /\r\n?/, "\n" )
 270        if copy.include? ?&
 271          copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
 272          copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
 273          copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
 274          copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
 275          copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
 276          copy.gsub!( /&#0*((?:\d+)|(?:x[a-f0-9]+));/ ) {|m|
 277            m=$1
 278            #m='0' if m==''
 279            m = "0#{m}" if m[0] == ?x
 280            [Integer(m)].pack('U*')
 281          }
 282        end
 283        copy
 284      end
 285 
    # Matches an '&' that is not followed by an Entity::NAME and a ';'.
    # In the code visible here it is referenced only from a commented-out
    # line in normalize.
    EREFERENCE = /&(?!#{Entity::NAME};)/
 287      # Escapes all possible entities
 288      def Text::normalize( input, doctype=nil, entity_filter=nil )
 289        copy = input.to_s
 290        # Doing it like this rather than in a loop improves the speed
 291        #copy = copy.gsub( EREFERENCE, '&amp;' )
 292        copy = copy.gsub( "&", "&amp;" )
 293        if doctype
 294          # Replace all ampersands that aren't part of an entity
 295          doctype.entities.each_value do |entity|
 296            copy = copy.gsub( entity.value, 
 297              "&#{entity.name};" ) if entity.value and 
 298                not( entity_filter and entity_filter.include?(entity) )
 299          end
 300        else
 301          # Replace all ampersands that aren't part of an entity
 302          DocType::DEFAULT_ENTITIES.each_value do |entity|
 303            copy = copy.gsub(entity.value, "&#{entity.name};" )
 304          end
 305        end
 306        copy
 307      end
 308 
 309      # Unescapes all possible entities
 310      def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
 311        rv = string.clone
 312        rv.gsub!( /\r\n?/, "\n" )
 313        matches = rv.scan( REFERENCE )
 314        return rv if matches.size == 0
 315        rv.gsub!( NUMERICENTITY ) {|m|
 316          m=$1
 317          m = "0#{m}" if m[0] == ?x
 318          [Integer(m)].pack('U*')
 319        }
 320        matches.collect!{|x|x[0]}.compact!
 321        if matches.size > 0
 322          if doctype
 323            matches.each do |entity_reference|
 324              unless filter and filter.include?(entity_reference)
 325                entity_value = doctype.entity( entity_reference )
 326                re = /&#{entity_reference};/
 327                rv.gsub!( re, entity_value ) if entity_value
 328              end
 329            end
 330          else
 331            matches.each do |entity_reference|
 332              unless filter and filter.include?(entity_reference)
 333                entity_value = DocType::DEFAULT_ENTITIES[ entity_reference ]
 334                re = /&#{entity_reference};/
 335                rv.gsub!( re, entity_value.value ) if entity_value
 336              end
 337            end
 338          end
 339          rv.gsub!( /&amp;/, '&' )
 340        end
 341        rv
 342      end
 343    end
 344  end