Created: Wed May 14 23:07:26 UTC 2008
#!/usr/bin/env ruby

# $Id: ya2yaml.rb,v 0.26 2007-01-19 20:42:42+09 funai Exp funai $
#
# Author::    Akira FUNAI
# Copyright:: Copyright (c) 2006 Akira FUNAI
# License::   MIT License

# Emits Ruby objects as YAML 1.1 text. Strings are classified from their
# UTF-8 code points and written as plain scalars, double-quoted scalars,
# literal block scalars or base64 "!binary" blocks.
#
# The public entry point is #_ya2yaml; every other method is private.
class Ya2YAML

	# Creates an emitter configured by +options+ (a Hash).
	#
	# Keys read by this class:
	# :indent_size::          spaces per nesting level; missing or
	#                         non-positive values become 2.
	# :minimum_block_length:: strings shorter than this are written as quoted
	#                         scalars instead of block scalars; missing or
	#                         non-positive values become 0.
	# :hash_order::           optional Array; keys of the top-level Hash found
	#                         in it are emitted first, in that order.
	# :syck_compatible::      when truthy, forces :printable_with_syck,
	#                         :escape_b_specific and :escape_as_utf8 to true.
	def initialize(options = {})
		# Work on a copy: the defaults below are written into the Hash and the
		# caller's object must not change as a side effect.
		options = options.dup
		options[:indent_size] = 2          if options[:indent_size].to_i <= 0
		options[:minimum_block_length] = 0 if options[:minimum_block_length].to_i <= 0
		options.update(
			{
				:printable_with_syck  => true,
				:escape_b_specific    => true,
				:escape_as_utf8       => true,
			}
		) if options[:syck_compatible]

		@options = options
	end

	# Returns +obj+ serialized as a YAML document: "--- ", the emitted node and
	# a trailing newline.
	#
	# Raises RuntimeError on Ruby older than 1.9 when $KCODE is not 'UTF8'.
	def _ya2yaml(obj)
		# $KCODE is only a working setting before Ruby 1.9; on later versions
		# it is never 'UTF8', so the check is skipped there.
		raise 'set $KCODE to "UTF8".' if RUBY_VERSION < '1.9.0' && $KCODE != 'UTF8'
		'--- ' + emit(obj,1) + "\n"
	end

	private

	# Returns the YAML text for +obj+ nested at +level+ (1 = document root).
	# Dispatches on the name of obj's class; Structs and any other object are
	# written with a "!ruby/struct:" / "!ruby/object:" tag followed by a
	# mapping of their members / instance variables.
	def emit(obj,level)
		case obj.class.to_s
			when 'Array'
				if (obj.length == 0)
					'[]'
				else
					indent = "\n" + s_indent(level - 1)
					obj.collect {|o|
						indent + '- ' + emit(o,level + 1)
					}.join('')
				end
			when 'Hash'
				if (obj.length == 0)
					'{}'
				else
					indent = "\n" + s_indent(level - 1)
					hash_order = @options[:hash_order]
					if (hash_order && level == 1)
						# Keys listed in :hash_order come first (by their position
						# there); all other keys follow, ordered by their to_s.
						hash_keys = obj.keys.sort {|x,y|
							x_order = hash_order.index(x) || Float::MAX
							y_order = hash_order.index(y) || Float::MAX
							o = (x_order <=> y_order)
							(o != 0) ? o : (x.to_s <=> y.to_s)
						}
					else
						hash_keys = obj.keys.sort {|x,y| x.to_s <=> y.to_s }
					end
					hash_keys.collect {|k|
						key = emit(k,level + 1)
						if (
							is_one_plain_line?(key) ||
							key =~ /\A(#{REX_BOOL}|#{REX_FLOAT}|#{REX_INT}|#{REX_NULL})\z/x
						)
							indent + key + ': ' + emit(obj[k],level + 1)
						else
							# anything else is written as an explicit "? key" entry
							indent + '? ' + key +
							indent + ': ' + emit(obj[k],level + 1)
						end
					}.join('')
				end
			when 'NilClass'
				'~'
			when 'String'
				emit_string(obj,level)
			when 'TrueClass','FalseClass'
				obj.to_s
			when 'Fixnum','Bignum','Integer','Float'
				# 'Integer' is what integers report on Ruby 2.4 and later, where
				# Fixnum and Bignum were unified.
				obj.to_s
			when 'Date'
				obj.to_s
			when 'Time'
				# Split the UTC offset on its absolute value. Truncating the hours
				# toward zero while taking a floored modulo for the minutes gave
				# wrong minutes for negative offsets such as -05:45.
				offset = obj.gmtoff
				off_sign = (offset < 0) ? '-' : '+'
				off_h, off_s = offset.abs.divmod(3600)
				off_hm = sprintf('%s%.2d:%.2d', off_sign, off_h, off_s / 60)
				u_sec = (obj.usec != 0) ? sprintf(".%.6d",obj.usec) : ''
				obj.strftime("%Y-%m-%d %H:%M:%S#{u_sec} #{off_hm}")
			when 'Symbol'
				'!ruby/symbol ' + obj.to_s
			when 'Range'
				'!ruby/range ' + obj.to_s
			when 'Regexp'
				'!ruby/regexp ' + obj.inspect
			else
				case
					when obj.is_a?(Struct)
						struct_members = {}
						obj.each_pair{|k,v| struct_members[k.to_s] = v }
						'!ruby/struct:' + obj.class.to_s.sub(/^(Struct::(.+)|.*)$/,'\2') + ' ' +
						emit(struct_members,level + 1)
					else
						# serialized as a generic object
						object_members = {}
						obj.instance_variables.each{|k|
							# names are Strings on Ruby 1.8 and Symbols on 1.9+;
							# to_s makes the "@" removal work for both
							object_members[k.to_s.sub(/^@/,'')] = obj.instance_variable_get(k)
						}
						'!ruby/object:' + obj.class.to_s + ' ' +
						emit(object_members,level + 1)
				end
		end
	end

	# Picks the scalar style for +str+ from the flags returned by #string_type:
	# plain for a printable single plain line, double-quoted for other single
	# lines, short strings and non-printable text, literal block for the
	# remaining printable text, and base64 when str is not a UTF-8 string.
	def emit_string(str,level)
		(is_string,is_printable,is_one_line,is_one_plain_line) = string_type(str)
		if is_string
			if is_printable
				if is_one_plain_line
					emit_simple_string(str,level)
				else
					(is_one_line || str.length < @options[:minimum_block_length]) ?
						emit_quoted_string(str,level) :
						emit_block_string(str,level)
				end
			else
				emit_quoted_string(str,level)
			end
		else
			emit_base64_binary(str,level)
		end
	end

	# Plain scalar: the string is returned unchanged.
	def emit_simple_string(str,level)
		str
	end

	# Literal block scalar ("|"): line breaks are normalized, continuation
	# lines are indented for +level+, and the header carries an indentation
	# indicator (when the text starts with a space) and a chomping indicator
	# derived from the number of trailing line breaks.
	def emit_block_string(str,level)
		str = normalize_line_break(str)

		indent = s_indent(level)
		indentation_indicator = (str =~ /\A /) ? indent.size.to_s : ''
		# $1 is the run of line breaks at the very end of the string
		str =~ /(#{REX_NORMAL_LB}*)\z/
		chomping_indicator = case $1.length
			when 0
				'-'
			when 1
				''
			else
				'+'
		end

		str.chomp!
		str.gsub!(/#{REX_NORMAL_LB}/) {
			$1 + indent
		}
		'|' + indentation_indicator + chomping_indicator + "\n" + indent + str
	end

	# Double-quoted scalar. Strings shorter than :minimum_block_length stay on
	# one line with every line break escaped; longer ones are split after each
	# escaped line break with a backslash-newline continuation, indented for
	# +level+.
	def emit_quoted_string(str,level)
		str = yaml_escape(normalize_line_break(str))
		if (str.length < @options[:minimum_block_length])
			str.gsub!(/#{REX_NORMAL_LB}/) { ESCAPE_SEQ_LB[$1] }
		else
			# breaks sitting at an end of line are escaped without a continuation
			str.gsub!(/#{REX_NORMAL_LB}$/) { ESCAPE_SEQ_LB[$1] }
			# a tab or space directly after a run of breaks is escaped as well;
			# $3 is that character ($2 belongs to the interpolated REX_NORMAL_LB)
			str.gsub!(/(#{REX_NORMAL_LB}+)(.)/) {
				trail_c = $3
				$1 + trail_c.sub(/([\t ])/) { ESCAPE_SEQ_WS[$1] }
			}
			indent = s_indent(level)
			str.gsub!(/#{REX_NORMAL_LB}/) {
				ESCAPE_SEQ_LB[$1] + "\\\n" + indent
			}
		end
		'"' + str + '"'
	end

	# "!binary" block scalar holding +str+ base64-encoded, with every encoded
	# line indented for +level+.
	def emit_base64_binary(str,level)
		indent = "\n" + s_indent(level)
		base64 = [str].pack('m')
		'!binary |' + indent + base64.gsub(/\n(?!\z)/,indent)
	end

	# Classifies +str+ and returns four flags:
	# is_string, is_printable, is_one_line, is_one_plain_line.
	# All four are false when str cannot be unpacked as UTF-8.
	def string_type(str)
		(ucs_codes = str.unpack('U*')) rescue (
			# ArgumentError -> binary data
			return false,false,false,false
		)
		if (
			@options[:printable_with_syck] &&
			str =~ /\A#{REX_ANY_LB}* | #{REX_ANY_LB}*\z|#{REX_ANY_LB}{2}\z/
		)
			# detour Syck bug
			return true,false,is_one_line?(str),false
		end
		ucs_codes.each {|ucs_code|
			return true,false,is_one_line?(str),false unless is_printable?(ucs_code)
		}
		return true,true,is_one_line?(str),is_one_plain_line?(str)
	end

	# True when the code point +ucs_code+ is in the printable set below. With
	# :escape_b_specific set, U+2028 and U+2029 count as non-printable.
	def is_printable?(ucs_code)
		# YAML 1.1 / 4.1.1.
		(
			[0x09,0x0a,0x0d,0x85].include?(ucs_code)      ||
			(ucs_code <=     0x7e && ucs_code >=    0x20) ||
			(ucs_code <=   0xd7ff && ucs_code >=    0xa0) ||
			(ucs_code <=   0xfffd && ucs_code >=  0xe000) ||
			(ucs_code <= 0x10ffff && ucs_code >= 0x10000)
		) &&
		!(
			# treat LS/PS as non-printable characters
			@options[:escape_b_specific] &&
			(ucs_code == 0x2028 || ucs_code == 0x2029)
		)
	end

	# True when +str+ has no line break other than one at its very end.
	def is_one_line?(str)
		str !~ /#{REX_ANY_LB}(?!\z)/
	end

	# True when +str+ can be written as a plain scalar: no indicator character
	# or whitespace at the start of a line, no ':', '#', whitespace, brackets,
	# braces or commas anywhere, no line break, and not readable as a bool,
	# float, int, merge key, null, timestamp or value key.
	def is_one_plain_line?(str)
		# YAML 1.1 / 4.6.11.
		str !~ /^([\-\?:,\[\]\{\}\#&\*!\|>'"%@`\s]|---|\.\.\.)/    &&
		str !~ /[:\#\s\[\]\{\},]/                                  &&
		str !~ /#{REX_ANY_LB}/                                     &&
		str !~ /^(#{REX_BOOL}|#{REX_FLOAT}|#{REX_INT}|#{REX_MERGE}
			|#{REX_NULL}|#{REX_TIMESTAMP}|#{REX_VALUE})$/x
	end

	# Returns the run of spaces for nesting +level+.
	def s_indent(level)
		# YAML 1.1 / 4.2.2.
		' ' * (level * @options[:indent_size])
	end

	# Returns a copy of +str+ with CRLF, CR and NEL replaced by LF.
	def normalize_line_break(str)
		# YAML 1.1 / 4.1.4.
		str.gsub(/(#{REX_CRLF}|#{REX_CR}|#{REX_NEL})/,"\n")
	end

	# Returns a copy of +str+ with characters escaped for a double-quoted
	# scalar. ASCII letters and digits are never touched; other characters
	# use their ESCAPE_SEQ entry, stay as they are when printable, or become
	# \x / \u / \U escapes (per-byte \x escapes when :escape_as_utf8 is set).
	def yaml_escape(str)
		# YAML 1.1 / 4.1.6.
		str.gsub(/[^a-zA-Z0-9]/u) {|c|
			# Fallback is the code point of "?". It is spelled as a number
			# because the ?? literal is a String on Ruby 1.9+, which the numeric
			# comparisons in is_printable? cannot handle.
			ucs_code, = (c.unpack('U') rescue [0x3f])
			case
				when ESCAPE_SEQ[c]
					ESCAPE_SEQ[c]
				when is_printable?(ucs_code)
					c
				when @options[:escape_as_utf8]
					'\\x' + c.unpack('H2' * c.size).join('\\x')
				when ucs_code == 0x2028 || ucs_code == 0x2029
					ESCAPE_SEQ_LB[c]
				when ucs_code <= 0x7f
					sprintf('\\x%.2x',ucs_code)
				when ucs_code <= 0xffff
					sprintf('\\u%.4x',ucs_code)
				else
					sprintf('\\U%.8x',ucs_code)
			end
		}
	end

	# Escape tables and regular expressions shared by the emitter methods.
	module Constants
		UCS_0X85   = [0x85].pack('U')   #   c285@UTF8 Unicode next line
		UCS_0XA0   = [0xa0].pack('U')   #   c2a0@UTF8 Unicode non-breaking space
		UCS_0X2028 = [0x2028].pack('U') # e280a8@UTF8 Unicode line separator
		UCS_0X2029 = [0x2029].pack('U') # e280a9@UTF8 Unicode paragraph separator

		# non-break characters
		ESCAPE_SEQ = {
			"\x00" => '\\0',
			"\x07" => '\\a',
			"\x08" => '\\b',
			"\x0b" => '\\v',
			"\x0c" => '\\f',
			"\x1b" => '\\e',
			"\""   => '\\"',
			"\\"   => '\\\\',
		}

		# non-breaking space
		ESCAPE_SEQ_NS = {
			UCS_0XA0 => '\\_',
		}

		# white spaces
		ESCAPE_SEQ_WS = {
			"\x09" => '\\t',
			" "    => '\\x20',
		}

		# line breaks
		ESCAPE_SEQ_LB ={
			"\x0a"     => '\\n',
			"\x0d"     => '\\r',
			UCS_0X85   => '\\N',
			UCS_0X2028 => '\\L',
			UCS_0X2029 => '\\P',
		}

		# regexps for line breaks
		REX_LF   = Regexp.escape("\x0a")
		REX_CR   = Regexp.escape("\x0d")
		REX_CRLF = Regexp.escape("\x0d\x0a")
		REX_NEL  = Regexp.escape(UCS_0X85)
		REX_LS   = Regexp.escape(UCS_0X2028)
		REX_PS   = Regexp.escape(UCS_0X2029)

		# Both contain one capture group of their own, which shifts the group
		# numbers of any regexp they are interpolated into.
		REX_ANY_LB    = /(#{REX_LF}|#{REX_CR}|#{REX_NEL}|#{REX_LS}|#{REX_PS})/
		REX_NORMAL_LB = /(#{REX_LF}|#{REX_LS}|#{REX_PS})/

		# regexps for language-independent types for YAML 1.1
		REX_BOOL = /
			 y|Y|yes|Yes|YES|n|N|no|No|NO
			|true|True|TRUE|false|False|FALSE
			|on|On|ON|off|Off|OFF
		/x
		REX_FLOAT = /
			 [-+]?([0-9][0-9_]*)?\.[0-9.]*([eE][-+][0-9]+)? # (base 10)
			|[-+]?[0-9][0-9_]*(:[0-5]?[0-9])+\.[0-9_]*      # (base 60)
			|[-+]?\.(inf|Inf|INF)                           # (infinity)
			|\.(nan|NaN|NAN)                                # (not a number)
		/x
		REX_INT = /
			 [-+]?0b[0-1_]+                   # (base 2)
			|[-+]?0[0-7_]+                    # (base 8)
			|[-+]?(0|[1-9][0-9_]*)            # (base 10)
			|[-+]?0x[0-9a-fA-F_]+             # (base 16)
			|[-+]?[1-9][0-9_]*(:[0-5]?[0-9])+ # (base 60)
		/x
		REX_MERGE = /
			<<
		/x
		REX_NULL = /
			 ~              # (canonical)
			|null|Null|NULL # (English)
			|               # (Empty)
		/x
		REX_TIMESTAMP = /
			 [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] # (ymd)
			|[0-9][0-9][0-9][0-9]                       # (year)
			 -[0-9][0-9]?                               # (month)
			 -[0-9][0-9]?                               # (day)
			 ([Tt]|[ \t]+)[0-9][0-9]?                   # (hour)
			 :[0-9][0-9]                                # (minute)
			 :[0-9][0-9]                                # (second)
			 (\.[0-9]*)?                                # (fraction)
			 (([ \t]*)Z|[-+][0-9][0-9]?(:[0-9][0-9])?)? # (time zone)
		/x
		REX_VALUE = /
			=
		/x
	end

	include Constants

end

class Object
	# Returns this object serialized as a YAML document string.
	# +options+ is handed to Ya2YAML.new unchanged.
	def ya2yaml(options = {})
		emitter = Ya2YAML.new(options)
		emitter._ya2yaml(self)
	end
end

__END__