diff options
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/sisu/v5/db_import.rb | 34 | ||||
| -rw-r--r-- | lib/sisu/v5/db_sqltxt.rb | 28 | ||||
| -rw-r--r-- | lib/sisu/v6/db_import.rb | 34 | ||||
| -rw-r--r-- | lib/sisu/v6/db_sqltxt.rb | 28 | 
4 files changed, 78 insertions, 46 deletions
| diff --git a/lib/sisu/v5/db_import.rb b/lib/sisu/v5/db_import.rb index 59cff28a..72fb3753 100644 --- a/lib/sisu/v5/db_import.rb +++ b/lib/sisu/v5/db_import.rb @@ -334,17 +334,17 @@ module SiSU_DbImport          @en,@en_ast,@en_pls,@tuple_array=[],[],[],[]          @col[:en_a],@col[:en_z]=nil,nil          ao_array.each do |data| -          data.obj.gsub!(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ') -          data.obj.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check +          data.obj=data.obj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1'). +            gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1'). +            gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1'). +            gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1'). +            gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1'). +            gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1'). +            gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1'). +            gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1'). +            gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1'). +            gsub(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 '). +            gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check            @col[:seg]=@@seg            if data.of ==:para \            || data.of ==:heading \ @@ -374,7 +374,7 @@ module SiSU_DbImport                @col[:lid]+=1                txt=endnotes(txt).extract_any                body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_minus -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -425,7 +425,7 @@ module SiSU_DbImport                @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"                txt=endnotes(txt).extract_any                body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -470,7 +470,7 @@ module SiSU_DbImport                @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"                txt=endnotes(txt).extract_any                body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -501,7 +501,7 @@ module SiSU_DbImport                @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"                txt=endnotes(txt).extract_any                body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -532,7 +532,7 @@ module SiSU_DbImport                @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"                txt=endnotes(txt).extract_any                body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -603,7 +603,7 @@ module SiSU_DbImport                else                  SiSU_FormatShared::CSS_Format.new(@md,data).norm                end -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) diff --git a/lib/sisu/v5/db_sqltxt.rb b/lib/sisu/v5/db_sqltxt.rb index 6585fd66..3f6cf951 100644 --- a/lib/sisu/v5/db_sqltxt.rb +++ b/lib/sisu/v5/db_sqltxt.rb @@ -60,7 +60,7 @@  module SiSU_DbText    class Prepare      def special_character_escape(str) -      str=str.gsub(/'/,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'") +      str=str.gsub(/'/m,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'")          gsub(/(\\)/m,'\1\1'). #ok but with warnings, double backslash on sqlite #str.gsub!(/[\\]/m,'\\x5C') #ok but with warnings, but not for sqlite #str.gsub!(/(\\)/m,'\1') #ok for sqlite not for pgsql          gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/m,"<br>\n").          gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/m,''). #check @@ -80,13 +80,29 @@ module SiSU_DbText            gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').            gsub(/ \s+/m,' ')          #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ -        s +        txt_arr << s        end -      txt_arr << arr << en -      #txt_arr=txt_arr.flatten +      txt_arr=txt_arr << en        txt=txt_arr.flatten.join("\n") -      txt=special_character_escape(txt) -      txt +      special_character_escape(txt) +    end +    def clean_document_objects_body(arr) +      txt_arr,en,en_arr=[],[],[] +      arr=(arr.is_a?(String)) ? [ arr ] : arr +      arr.each do |s| +        en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) +        s=s.gsub(/#{Mx[:en_a_o]}\s*(\d+).+?#{Mx[:en_a_c]}/m,'<sup>\1</sup>'). +          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). +          gsub(/ \s+/m,' ') +        txt_arr << s +      end +      en.flatten.each do |e| +        e=e.sub(/^(\d+)\s*/,'<sup>\1</sup> ') +        en_arr << e +      end +      txt_arr=txt_arr << en_arr +      txt=txt_arr.flatten.join("\n<br>") +      special_character_escape(txt)      end      def clean_searchable_text_from_document_source(arr)        txt_arr,en=[],[] diff --git a/lib/sisu/v6/db_import.rb b/lib/sisu/v6/db_import.rb index 9473863d..5e159451 100644 --- a/lib/sisu/v6/db_import.rb +++ b/lib/sisu/v6/db_import.rb @@ -334,17 +334,17 @@ module SiSU_DbImport          @en,@en_ast,@en_pls,@tuple_array=[],[],[],[]          @col[:en_a],@col[:en_z]=nil,nil          ao_array.each do |data| -          data.obj.gsub!(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1') -          data.obj.gsub!(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 ') -          data.obj.gsub!(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check +          data.obj=data.obj.gsub(/#{Mx[:fa_bold_o]}(.+?)#{Mx[:fa_bold_c]}/,'\1'). +            gsub(/#{Mx[:fa_italics_o]}(.+?)#{Mx[:fa_italics_c]}/,'\1'). +            gsub(/#{Mx[:fa_underscore_o]}(.+?)#{Mx[:fa_underscore_c]}/,'\1'). +            gsub(/#{Mx[:fa_superscript_o]}(.+?)#{Mx[:fa_superscript_c]}/,'\1'). +            gsub(/#{Mx[:fa_subscript_o]}(.+?)#{Mx[:fa_subscript_c]}/,'\1'). +            gsub(/#{Mx[:fa_insert_o]}(.+?)#{Mx[:fa_insert_c]}/,'\1'). +            gsub(/#{Mx[:fa_cite_o]}(.+?)#{Mx[:fa_cite_c]}/,'\1'). +            gsub(/#{Mx[:fa_strike_o]}(.+?)#{Mx[:fa_strike_c]}/,'\1'). +            gsub(/#{Mx[:fa_monospace_o]}(.+?)#{Mx[:fa_monospace_c]}/,'\1'). +            gsub(/#{Mx[:gl_o]}(●)#{Mx[:gl_c]}\s*/,'\1 '). +            gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/,'') #check            @col[:seg]=@@seg            if data.of ==:para \            || data.of ==:heading \ @@ -374,7 +374,7 @@ module SiSU_DbImport                @col[:lid]+=1                txt=endnotes(txt).extract_any                body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_minus -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -425,7 +425,7 @@ module SiSU_DbImport                @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"                txt=endnotes(txt).extract_any                body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -470,7 +470,7 @@ module SiSU_DbImport                @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"                txt=endnotes(txt).extract_any                body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -501,7 +501,7 @@ module SiSU_DbImport                @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"                txt=endnotes(txt).extract_any                body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -532,7 +532,7 @@ module SiSU_DbImport                @base_url="#{@env.url.root}/#{@md.fnb}/#{@hname}.html"                txt=endnotes(txt).extract_any                body=SiSU_FormatShared::CSS_Format.new(@md,data).lev4_plus -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) @@ -603,7 +603,7 @@ module SiSU_DbImport                else                  SiSU_FormatShared::CSS_Format.new(@md,data).norm                end -              @col[:body]=special_character_escape(body) +              @col[:body]=clean_document_objects_body(body)                plaintext=@col[:body].dup                plaintext=strip_markup(plaintext)                @col[:plaintext]=clean_searchable_text_from_document_objects(plaintext) diff --git a/lib/sisu/v6/db_sqltxt.rb b/lib/sisu/v6/db_sqltxt.rb index 2fd39fb7..2375d5ca 100644 --- a/lib/sisu/v6/db_sqltxt.rb +++ b/lib/sisu/v6/db_sqltxt.rb @@ -60,7 +60,7 @@  module SiSU_DbText    class Prepare      def special_character_escape(str) -      str=str.gsub(/'/,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'") +      str=str.gsub(/'/m,"''"). #string.gsub!(/'/,"\047") #string.gsub!(/'/,"\\'")          gsub(/(\\)/m,'\1\1'). #ok but with warnings, double backslash on sqlite #str.gsub!(/[\\]/m,'\\x5C') #ok but with warnings, but not for sqlite #str.gsub!(/(\\)/m,'\1') #ok for sqlite not for pgsql          gsub(/#{Mx[:br_line]}|#{Mx[:br_nl]}/m,"<br>\n").          gsub(/#{Mx[:tag_o]}\S+?#{Mx[:tag_c]}/m,''). #check @@ -80,13 +80,29 @@ module SiSU_DbText            gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,'').            gsub(/ \s+/m,' ')          #p s if s =~/[^ \nA-Za-z0-9'"`?!#@$%^&*=+,.;:\[\]()<>{}‹›|\\\/~_-]/ -        s +        txt_arr << s        end -      txt_arr << arr << en -      #txt_arr=txt_arr.flatten +      txt_arr=txt_arr << en        txt=txt_arr.flatten.join("\n") -      txt=special_character_escape(txt) -      txt +      special_character_escape(txt) +    end +    def clean_document_objects_body(arr) +      txt_arr,en,en_arr=[],[],[] +      arr=(arr.is_a?(String)) ? [ arr ] : arr +      arr.each do |s| +        en << s.scan(/#{Mx[:en_a_o]}\s*(.+?)\s*#{Mx[:en_a_c]}/m) +        s=s.gsub(/#{Mx[:en_a_o]}\s*(\d+).+?#{Mx[:en_a_c]}/m,'<sup>\1</sup>'). +          gsub(/#{Mx[:en_b_o]}.+?#{Mx[:en_b_c]}/m,''). +          gsub(/ \s+/m,' ') +        txt_arr << s +      end +      en.flatten.each do |e| +        e=e.sub(/^(\d+)\s*/,'<sup>\1</sup> ') +        en_arr << e +      end +      txt_arr=txt_arr << en_arr +      txt=txt_arr.flatten.join("\n<br>") +      special_character_escape(txt)      end      def clean_searchable_text_from_document_source(arr)        txt_arr,en=[],[] | 
