From 2a738f528cb87793ff7f8312099666af1e21f44c Mon Sep 17 00:00:00 2001 From: Ralph Amissah Date: Wed, 3 Sep 2008 22:25:03 -0400 Subject: xml character encoding adjusted; xml image match; odf issue with '@' symbol in url --- lib/sisu/v0/character_encoding.rb | 4 +++- lib/sisu/v0/odf.rb | 7 +++++-- lib/sisu/v0/param.rb | 42 +++++++++++++++++++-------------------- lib/sisu/v0/shared_xml.rb | 29 +++++++++++++++++++++++++++ lib/sisu/v0/xhtml.rb | 2 +- lib/sisu/v0/xml.rb | 2 +- lib/sisu/v0/xml_dom.rb | 2 +- 7 files changed, 61 insertions(+), 27 deletions(-) diff --git a/lib/sisu/v0/character_encoding.rb b/lib/sisu/v0/character_encoding.rb index 60c2f335..aa856cdd 100644 --- a/lib/sisu/v0/character_encoding.rb +++ b/lib/sisu/v0/character_encoding.rb @@ -374,7 +374,9 @@ module SiSU_character_encode ['ü', 252, '374', '\303\274', 'ü', 'ü', 'ü', '\"{u}', '', 'Small u, umlaut ü' ], ['ý', 253, '375', '\303\275', 'ý', 'ý', 'ý', '', '', 'Small y, acute accent ý' ], ['þ', 254, '376', '\303\276', 'þ', 'þ', 'þ', '', '', 'Small thorn, Icelandic þ' ], - ['ÿ', 255, '377', '\303\277', 'ÿ', 'ÿ', 'ÿ', '', '', 'Smally y, umlaut ÿ' ] + ['ÿ', 255, '377', '\303\277', 'ÿ', 'ÿ', 'ÿ', '', '', 'Smally y, umlaut ÿ' ], + ['∝', , '', '', '∝', '∝', '∝', '', '', 'proportional to U+221D (8733) ∝' ], + ['∞', , '', '', '∞', '∞', '∞', '', '', 'infinity U+221E (8734) ∞' ], ] end end diff --git a/lib/sisu/v0/odf.rb b/lib/sisu/v0/odf.rb index 6b1491c2..bf23f91f 100644 --- a/lib/sisu/v0/odf.rb +++ b/lib/sisu/v0/odf.rb @@ -267,7 +267,7 @@ module SiSU_ODF #para.gsub!(/\b((?:https?|ftp):\/\/\S+?\.[^'"><\s]+?)([;.,]?(?:\s|$))/, also works #%{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}\\2}) #http ftp matches with decoration para.gsub!(/([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+)/, - %{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}}) + %{#{@url_brace.xml_open}\\1#{@url_brace.xml_close}}) if para !~/http:\/\// # improve upon, document crash where url contains '@' symbol para=case para when /^#{Mx[:pa_o]}:i([1-9])#{Mx[:pa_c]}/m m=$1 @@ -443,6 +443,7 @@ module SiSU_ODF #para.gsub!(/<(~\d+;(?:\w|[0-6]:)\d+;\w\d+)><(#@dp:#@dp)>/,'<\1><\2>') para='' if para =~/#{Mx[:lv_o]}\d+:.*?#{Mx[:lv_c]}.+?#{Mx[:pa_non_object_dummy_heading]}/ para_array=[] + para.gsub!(//,'>') word=para.scan(/\S+|\n/) if word word.each do |w| # _ - / # | : ! ^ ~ @@ -487,8 +488,10 @@ module SiSU_ODF para.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1') para.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1') para.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1') - para.gsub!(/`/,"'") + para.gsub!(/[`’]/,"'") para.gsub!(/­/u,'-') + para.gsub!(/ /u, ' ') # space identify + para.gsub!(/ /u, ' ') # space identify para.gsub!(/·/u,'*') para.gsub!(/[“”]/u,'""') para.gsub!(/[­–—]/u,'-') #— – chk diff --git a/lib/sisu/v0/param.rb b/lib/sisu/v0/param.rb index 3cfbf1e5..b211f5c1 100644 --- a/lib/sisu/v0/param.rb +++ b/lib/sisu/v0/param.rb @@ -368,32 +368,32 @@ module SiSU_Param @dc_date_modified=date @date_modified_scheme='scheme="ISO-8601"' if date =~/\d{4}-\d{2}-\d{2}/ end - when /^(?:0~type|@type:)\s+(.+?)$/m; @dc_type=$1 #% metainfo DC - when /^(?:0~format|@format:)\s+(.+?)$/m; @dc_format=$1 #% metainfo DC - #when /^(?:0~identifier|@identifier:)\s+(.+?)$/m; @dc_identifier=$1 #% metainfo DC - when /^(?:0~source|@source:)\s+(.+?)$/m; @dc_source=$1 #% metainfo DC - when /^(?:0~language(?:\.document)?|@language(?:\.document)?:)\s+(.+?)$/m #% metainfo DC + when /^(?:0~type|@type:)\s+(.+?)$/m; @dc_type=$1 #% metainfo DC + when /^(?:0~format|@format:)\s+(.+?)$/m; @dc_format=$1 #% metainfo DC + #when /^(?:0~identifier|@identifier:)\s+(.+?)$/m; @dc_identifier=$1 #% metainfo DC + when /^(?:0~source|@source:)\s+(.+?)$/m; @dc_source=$1 #% metainfo DC + when /^(?:0~language(?:\.document)?|@language(?:\.document)?:)\s+(.+?)$/m #% metainfo DC x=$1.strip lang=SiSU_Env::Standardise_language.new(x.dup) @dc_language[:code]=lang.code @dc_language[:name]=lang.title - when /^(?:0~language\.original|@language\.original:)\s+(.+?)$/m #% metainfo DC + when /^(?:0~language\.original|@language\.original:)\s+(.+?)$/m #% metainfo DC x=$1.strip lang=SiSU_Env::Standardise_language.new(x.dup) @language_original[:name]=lang.title - when /^(?:0~relation|@relation:)\s+(.+?)$/m; @dc_relation=$1 #% metainfo DC - when /^(?:0~coverage|@coverage:)\s+(.+?)$/m; @dc_coverage=$1 #% metainfo DC - when /^(?:0~rights|@rights:)\s+(.+?)$/m; @dc_rights=$1 #% metainfo DC copyright, public domain, copyleft, creative commons, etc. - when /^(?:0~papersize|@papersize:)\s+(.+?)$/m #% metainfo DC + when /^(?:0~relation|@relation:)\s+(.+?)$/m; @dc_relation=$1 #% metainfo DC + when /^(?:0~coverage|@coverage:)\s+(.+?)$/m; @dc_coverage=$1 #% metainfo DC + when /^(?:0~rights|@rights:)\s+(.+?)$/m; @dc_rights=$1.gsub(/<(?:\/\s*)?br(?:\s*\/)?>/,Mx[:br_line]) #% metainfo DC copyright, public domain, copyleft, creative commons, etc. + when /^(?:0~papersize|@papersize:)\s+(.+?)$/m #% metainfo DC l=$1 if @mod.inspect !~/--papersize[=-]\S+/ l=determine_papersize(l.dup) @papersize=l end - when /^(?:0~keywords?|@keywords?:?)\s+(.+?)$/m; @keywords=$1 #% metainfo DC - when /^(?:0~comments?|@comments?:?)\s+(.+?)$/m; @comments=$1 #% metainfo DC - when /^(?:0~abstract|@abstract)\s+(.+?)$/m; @abstract=$1 #% metainfo DC - when /^(?:0~tags?|@tags?:)\s+\S/m #% metainfo + when /^(?:0~keywords?|@keywords?:?)\s+(.+?)$/m; @keywords=$1 #% metainfo DC + when /^(?:0~comments?|@comments?:?)\s+(.+?)$/m; @comments=$1.gsub(/<(?:\/\s*)?br(?:\s*\/)?>/,Mx[:br_line]) #% metainfo DC + when /^(?:0~abstract|@abstract)\s+(.+?)$/m; @abstract=$1.gsub(/<(?:\/\s*)?br(?:\s*\/)?>/,Mx[:br_line]) #% metainfo DC + when /^(?:0~tags?|@tags?:)\s+\S/m #% metainfo tags=para.match(/^(?:0~tags?|@tags?:)\s+(.+)\Z/m)[1] tags.split(/,|$/).each do |tag| tag.strip! @@ -403,18 +403,18 @@ module SiSU_Param tag_a=tag_a.split(/:/).join('][') @tag_a << tag_a end - when /^(?:0~catalogue|@catalogue:)\s+(.+)?$/m #% metainfo + when /^(?:0~catalogue|@catalogue:)\s+(.+)?$/m #% metainfo m=$1 @cls_pg=m.match(/pg=(\S+)/)[1] if m =~/pg=/ @cls_isbn=m.match(/isbn=(\S+)/)[1] if m =~/isbn=/ @cls_dewey=m.match(/dewey=(\S+)/)[1] if m =~/dewey=/ @cls_loc=m.match(/loc=(\S+)/)[1] if m =~/loc=/ - when /^(?:0~class(?:ify)?_loc|@class(?:ify)?_loc:)\s+(.+?)$/m; @cls_loc=$1 #% metainfo - when /^(?:0~class(?:ify)?_dewey|@class(?:ify)?_dewey:)\s+(.+?)$/m; @cls_dewey=$1 #% metainfo - when /^(?:0~class(?:ify)?_pg|@class(?:ify)?_pg)\s+(.+?)$/m; @cls_pg=$1 #% metainfo - when /^(?:0~(?:class(?:ify)?_)?isbn|@(?:class(?:ify)?_)?isbn)\s+(\S+?)$/m; @cls_isbn=$1 #% metainfo - when /^(?:0~images?|@images?:)\s+(.+?)$/m; @image=$1 #% processing - when /^(?:0~(?:toc|structure)|@(?:toc|structure):)\s+(.+?)\Z/m #% processing + when /^(?:0~class(?:ify)?_loc|@class(?:ify)?_loc:)\s+(.+?)$/m; @cls_loc=$1 #% metainfo + when /^(?:0~class(?:ify)?_dewey|@class(?:ify)?_dewey:)\s+(.+?)$/m; @cls_dewey=$1 #% metainfo + when /^(?:0~class(?:ify)?_pg|@class(?:ify)?_pg)\s+(.+?)$/m; @cls_pg=$1 #% metainfo + when /^(?:0~(?:class(?:ify)?_)?isbn|@(?:class(?:ify)?_)?isbn)\s+(\S+?)$/m; @cls_isbn=$1 #% metainfo + when /^(?:0~images?|@images?:)\s+(.+?)$/m; @image=$1 #% processing + when /^(?:0~(?:toc|structure)|@(?:toc|structure):)\s+(.+?)\Z/m #% processing doc_toc_str=$1 @toc=doc_toc_str.split(/;\s*/) @toc=[ @toc ] if @toc == String diff --git a/lib/sisu/v0/shared_xml.rb b/lib/sisu/v0/shared_xml.rb index 9203f0df..228a5c14 100644 --- a/lib/sisu/v0/shared_xml.rb +++ b/lib/sisu/v0/shared_xml.rb @@ -166,6 +166,8 @@ module SiSU_XML_munge #¢£¥§©ª«®°±²³µ¶¹º»¼½¾×÷ ##para.gsub!(//, '&#;') ##para.gsub!(//, '&;') + para.gsub!(//u, '>') # '>' # > para.gsub!(/¢/u, '¢') # '¢' # ¢ para.gsub!(/£/u, '£') # '£' # £ para.gsub!(/¥/u, '¥') # '¥' # ¥ @@ -250,10 +252,23 @@ module SiSU_XML_munge para.gsub!(/ü/u, 'ý') # 'ü' # ý para.gsub!(/þ/u, 'þ') # 'þ' # þ para.gsub!(/ÿ/u, 'ÿ') # 'ÿ' # ÿ + para.gsub!(/‘/u, '‘') # '‘' # ‘ + para.gsub!(/’/u, '’') # '’' # ’ + para.gsub!(/–/u, '–') # – # – + para.gsub!(/—/u, '—') # — # — + para.gsub!(/∝/u, '∝') # ∝ # ∝ + para.gsub!(/∞/u, '∞') # ∞ # ∞ + para.gsub!(/™/u, '™') # ™ # ™ + para.gsub!(/✠/u, '†') # † # † incorrect replacement! † + para.gsub!(/ /u, ' ') # space identify + para.gsub!(/ /u, ' ') # space identify end end def html(para='') if @sys.locale =~/utf-?8/i # instead ucs for utf8 #require 'iconv' ? Iñtërnâtiônàlizætiøn + para.gsub!(/ /u, ' ') # space identify + para.gsub!(/ /u, ' ') # space identify + else para.gsub!(/¢/u, '¢') # ¢ para.gsub!(/£/u, '£') # £ para.gsub!(/¥/u, '¥') # ¥ @@ -338,6 +353,16 @@ module SiSU_XML_munge para.gsub!(/ü/u, 'ü') # ý para.gsub!(/þ/u, 'þ') # þ para.gsub!(/ÿ/u, 'ÿ') # ÿ + para.gsub!(/‘/u, '&#lsquo;') # ‘ # ‘ + para.gsub!(/’/u, '&#rsquo;') # ’ # ’ + para.gsub!(/–/u, '–') # – # – + para.gsub!(/—/u, '—') # — # — + para.gsub!(/∝/u, '∝') # ∝ # ∝ + para.gsub!(/∞/u, '∞') # ∞ # ∞ + para.gsub!(/™/u, '™') # ™ # ™ + para.gsub!(/✠/u, '†') # † # † incorrect replacement † + para.gsub!(/ /u, ' ') # space identify + para.gsub!(/ /u, ' ') # space identify end end self @@ -381,6 +406,10 @@ module SiSU_XML_munge %{[\\1] \\4}) para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}\s*(\S+?\.(?:jpg|png|gif))(\s+[^}]+)?#{Mx[:lnk_c]}(https?:\/\/\S+)/, %{\\1}) + para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}\s*(\S+?\.(?:jpg|png|gif))\s+(\d+)x(\d+)(\s+[^}]+)?#{Mx[:lnk_c]}image/, + %{[\\1] \\4}) + para.gsub!(/(?:^|[^_\\])#{Mx[:lnk_o]}\s*(\S+?\.(?:jpg|png|gif))(\s+[^}]+)?#{Mx[:lnk_c]}image/, + %{\\1}) para.gsub!(/(^|#{Mx[:gl_c]}|\s)#{Mx[:lnk_o]}(.+?)#{Mx[:lnk_c]}(https?:\/\/[^"><]+?)([,.:;"><]?(?=\s|$))/, '\1\2\4') #watch, compare html_tune para.gsub!(/(^|#{Mx[:gl_c]}|\s)((?:https?|file|ftp):\/\/\S+?\.[^'"><\s]+?)([;.,]?(?=\s|$))/, diff --git a/lib/sisu/v0/xhtml.rb b/lib/sisu/v0/xhtml.rb index 95da5d2d..5f99cacf 100644 --- a/lib/sisu/v0/xhtml.rb +++ b/lib/sisu/v0/xhtml.rb @@ -267,8 +267,8 @@ WOK (0..6).each { |x| @cont[x]=@level[x]=false } (4..6).each { |x| @xml_contents_close[x]='' } data.each do |para| - para=@trans.markup(para) @trans.char_enc.utf8(para) if @sys.locale =~/utf-?8/i #% utf8 + para=@trans.markup(para) if para =~/^#{Rx[:meta]}\s*.+?$/ # for headers d_meta=SiSU_text_utils::Header_scan.new(@md,para).meta if d_meta; xml_head(d_meta) diff --git a/lib/sisu/v0/xml.rb b/lib/sisu/v0/xml.rb index 4826a503..49c71b88 100644 --- a/lib/sisu/v0/xml.rb +++ b/lib/sisu/v0/xml.rb @@ -291,8 +291,8 @@ WOK (0..6).each { |x| @cont[x]=@level[x]=false } (4..6).each { |x| @xml_contents_close[x]='' } data.each do |para| - para=@trans.markup(para) @trans.char_enc.utf8(para) if @sys.locale =~/utf-?8/i #% utf8 + para=@trans.markup(para) if para =~/^#{Rx[:meta]}\s*.+?$/ # for headers d_meta=SiSU_text_utils::Header_scan.new(@md,para).meta if d_meta; xml_head(d_meta) diff --git a/lib/sisu/v0/xml_dom.rb b/lib/sisu/v0/xml_dom.rb index b2bc0de7..13aed504 100644 --- a/lib/sisu/v0/xml_dom.rb +++ b/lib/sisu/v0/xml_dom.rb @@ -349,8 +349,8 @@ WOK (0..6).each { |x| @cont[x]=@level[x]=false } (4..6).each { |x| @xml_contents_close[x]='' } data.each do |para| - para=@trans.markup(para) @trans.char_enc.utf8(para) if @sys.locale =~/utf-?8/i #% utf8 + para=@trans.markup(para) if para =~/^#{Rx[:meta]}\s*(.+?)$/ # for headers d_meta=SiSU_text_utils::Header_scan.new(@md,para).meta if d_meta; xml_head(d_meta) -- cgit v1.2.3