diff options
Diffstat (limited to 'lib/sisu/v2/db_import.rb')
| -rw-r--r-- | lib/sisu/v2/db_import.rb | 120 | 
1 files changed, 71 insertions, 49 deletions
| diff --git a/lib/sisu/v2/db_import.rb b/lib/sisu/v2/db_import.rb index 45aca11b..e351f6fc 100644 --- a/lib/sisu/v2/db_import.rb +++ b/lib/sisu/v2/db_import.rb @@ -122,7 +122,7 @@ module SiSU_DB_import        tell.puts_blue unless @opt.cmd =~/q/        tell=SiSU_Screen::Ansi.new(@opt.cmd,'Marshal Load',@fnc)        tell.print_grey if @opt.cmd =~/v/ -      select_first_match=%{ SELECT metadata_and_text.tid FROM metadata_and_text WHERE metadata_and_text.filename = '#{@opt.fns}'; } +      select_first_match=%{ SELECT metadata_and_text.tid FROM metadata_and_text WHERE metadata_and_text.src_filename = '#{@opt.fns}'; }        file_exist=@sql_type=~/sqlite/ \        ? @conn.get_first_value(select_first_match) \        : @conn.select_one(select_first_match) @@ -265,7 +265,10 @@ module SiSU_DB_import                @col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_minus                special_character_escape(@col[:body])                @col[:plaintext]=@col[:body].dup -              strip_markup(@col[:plaintext]) +              @col[:plaintext]=strip_markup(@col[:plaintext]) +              @col[:plaintext]=clean_searchable_text(@col[:plaintext]) +              @col[:words]=@col[:plaintext].dup +              @col[:words]=unique_words(@col[:words])                if @en[0]; @en_a,@en_z=@en[0].first,@en[0].last                end                if @en_ast[0]; @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last @@ -298,7 +301,10 @@ module SiSU_DB_import                @col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_plus                special_character_escape(@col[:body])                @col[:plaintext]=@col[:body].dup -              strip_markup(@col[:plaintext]) +              @col[:plaintext]=strip_markup(@col[:plaintext]) +              @col[:plaintext]=clean_searchable_text(@col[:plaintext]) +              @col[:words]=@col[:plaintext].dup +              @col[:words]=unique_words(@col[:words])                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -324,7 +330,10 @@ module SiSU_DB_import                @col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_plus                special_character_escape(@col[:body])                @col[:plaintext]=@col[:body].dup -              strip_markup(@col[:plaintext]) +              @col[:plaintext]=strip_markup(@col[:plaintext]) +              @col[:plaintext]=clean_searchable_text(@col[:plaintext]) +              @col[:words]=@col[:plaintext].dup +              @col[:words]=unique_words(@col[:words])                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -350,7 +359,10 @@ module SiSU_DB_import                @col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_plus                special_character_escape(@col[:body])                @col[:plaintext]=@col[:body].dup -              strip_markup(@col[:plaintext]) +              @col[:plaintext]=strip_markup(@col[:plaintext]) +              @col[:plaintext]=clean_searchable_text(@col[:plaintext]) +              @col[:words]=@col[:plaintext].dup +              @col[:words]=unique_words(@col[:words])                @en_a,@en_z=@en[0].first,@en[0].last if @en[0]                @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]                @en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0] @@ -389,12 +401,15 @@ module SiSU_DB_import                end                special_character_escape(@col[:body])                @col[:plaintext]=@col[:body].dup -              strip_markup(@col[:plaintext]) +              @col[:plaintext]=strip_markup(@col[:plaintext]) +              @col[:plaintext]=clean_searchable_text(@col[:plaintext]) +              @col[:words]=@col[:plaintext].dup +              @col[:words]=unique_words(@col[:words])                t=SiSU_DB_tuple::Load_documents.new(@conn,@col,@opt,@file)                @tuple_array << t.tuple                @en,@en_ast,@en_pls=[],[],[]                @col[:en_a]=@col[:en_z]=nil -              @col[:lev]=@col[:plaintext]=@col[:body]='' +              @col[:lev]=@col[:plaintext]=@col[:body]=@col[:words]=''              end              if notedata =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/                                         #% import into database endnotes tables                endnote_array=notedata.scan(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/) @@ -406,9 +421,9 @@ module SiSU_DB_import                    @id_n+=1                    special_character_escape(txt)                    body=SiSU_Format_Shared::CSS_Format.new(@md,data).endnote(nr,txt) -                  #special_character_escape(body) -                  #special_character_escape(txt)                    strip_markup(txt) +                  words=txt.dup +                  words=unique_words(words)                    if txt.size > (SiSU_DB_columns::Column_size.new.endnote_clean - 1)                      puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n"                      open("#{Dir.pwd}/pg_documents_error_log",'a') do |error| @@ -418,16 +433,17 @@ module SiSU_DB_import                    end                    if txt                      en={ :type => 'endnotes', -                      :id   => @id_n, -                      :lid  => @col[:lid], -                      :nr   => nr, -                      :txt  => txt, -                      :body => body, -                      :ocn  => @col[:ocn], -                      :ocnd => @col[:ocnd], -                      :ocns => @col[:ocns], -                      :id_t => @@id_t, -                      :hash => digest_clean +                      :id      => @id_n, +                      :lid     => @col[:lid], +                      :nr      => nr, +                      :txt     => txt, +                      :body    => body, +                      :words   => words, +                      :ocn     => @col[:ocn], +                      :ocnd    => @col[:ocnd], +                      :ocns    => @col[:ocns], +                      :id_t    => @@id_t, +                      :hash    => digest_clean                      }                      t=SiSU_DB_tuple::Load_endnotes.new(@conn,en,@opt,@file)                      @tuple_array << t.tuple @@ -447,6 +463,8 @@ module SiSU_DB_import                    special_character_escape(txt)                    body=SiSU_Format_Shared::CSS_Format.new(@md,data).endnote(nr,txt)                    strip_markup(txt) +                  words=txt.dup +                  words=unique_words(words)                    if txt.size > (SiSU_DB_columns::Column_size.new.endnote_clean - 1)                      puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n"                      open("#{Dir.pwd}/pg_documents_error_log",'a') do |error| @@ -456,16 +474,17 @@ module SiSU_DB_import                    end                    if txt                      en={ :type => 'endnotes_asterisk', -                      :id   => @id_n, -                      :lid  => @col[:lid], -                      :nr   => nr, -                      :txt  => txt, -                      :body => body, -                      :ocn  => @col[:ocn], -                      :ocnd => @col[:ocnd], -                      :ocns => @col[:ocns], -                      :id_t => @@id_t, -                      :hash => digest_clean +                      :id      => @id_n, +                      :lid     => @col[:lid], +                      :nr      => nr, +                      :txt     => txt, +                      :body    => body, +                      :words   => words, +                      :ocn     => @col[:ocn], +                      :ocnd    => @col[:ocnd], +                      :ocns    => @col[:ocns], +                      :id_t    => @@id_t, +                      :hash    => digest_clean                      }                      t=SiSU_DB_tuple::Load_endnotes.new(@conn,en,@opt,@file)                      @tuple_array << t.tuple @@ -485,6 +504,8 @@ module SiSU_DB_import                    special_character_escape(txt)                    body=SiSU_Format_Shared::CSS_Format.new(@md,data).endnote(nr,txt)                    strip_markup(txt) +                  words=txt.dup +                  words=unique_words(words)                    if txt.size > (SiSU_DB_columns::Column_size.new.endnote_clean - 1)                      puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n"                      open("#{Dir.pwd}/pg_documents_error_log",'a') do |error| @@ -494,16 +515,17 @@ module SiSU_DB_import                    end                    if txt                      en={ :type => 'endnotes_plus', -                      :id   => @id_n, -                      :lid  => @col[:lid], -                      :nr   => nr, -                      :txt  => txt, -                      :body => body, -                      :ocn  => @col[:ocn], -                      :ocnd => @col[:ocnd], -                      :ocns => @col[:ocns], -                      :id_t => @@id_t, -                      :hash => digest_clean +                      :id      => @id_n, +                      :lid     => @col[:lid], +                      :nr      => nr, +                      :txt     => txt, +                      :body    => body, +                      :words   => words, +                      :ocn     => @col[:ocn], +                      :ocnd    => @col[:ocnd], +                      :ocns    => @col[:ocns], +                      :id_t    => @@id_t, +                      :hash    => digest_clean                      }                      t=SiSU_DB_tuple::Load_endnotes.new(@conn,en,@opt,@file)                      @tuple_array << t.tuple @@ -526,25 +548,25 @@ module SiSU_DB_import            endnotes(@txt).range            @en << endnotes(@txt).standard if @txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/            @en_ast << endnotes(@txt).asterisk if @txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/ -          @en_pls << endnotes(@txt).plus if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_o]}/ +          @en_pls << endnotes(@txt).plus if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/            @txt=endnotes(@txt).clean_text          end          @txt        end        def standard -        x=if @txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/; @txt.scan(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/) -        else nil -        end +        x=(@txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/) \ +        ? @txt.scan(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/) \ +        : nil        end        def asterisk -        x=if @txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/; @txt.scan(/#{Mx[:en_b_o]}[*](\d+).+?#{Mx[:en_b_c]}/) -        else nil -        end +        x=(@txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/) \ +        ? @txt.scan(/#{Mx[:en_b_o]}[*](\d+).+?#{Mx[:en_b_c]}/) \ +        : nil        end        def plus -        x=if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/; @txt.scan(/#{Mx[:en_b_o]}[+](\d+).+?#{Mx[:en_b_c]}/) -        else nil -        end +        x=(@txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/) \ +        ? @txt.scan(/#{Mx[:en_b_o]}[+](\d+).+?#{Mx[:en_b_c]}/) \ +        : nil        end        def clean_text(base_url=nil)          if base_url | 
