aboutsummaryrefslogtreecommitdiffhomepage
path: root/lib/sisu/v2/db_import.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/sisu/v2/db_import.rb')
-rw-r--r--lib/sisu/v2/db_import.rb120
1 files changed, 71 insertions, 49 deletions
diff --git a/lib/sisu/v2/db_import.rb b/lib/sisu/v2/db_import.rb
index 45aca11b..e351f6fc 100644
--- a/lib/sisu/v2/db_import.rb
+++ b/lib/sisu/v2/db_import.rb
@@ -122,7 +122,7 @@ module SiSU_DB_import
tell.puts_blue unless @opt.cmd =~/q/
tell=SiSU_Screen::Ansi.new(@opt.cmd,'Marshal Load',@fnc)
tell.print_grey if @opt.cmd =~/v/
- select_first_match=%{ SELECT metadata_and_text.tid FROM metadata_and_text WHERE metadata_and_text.filename = '#{@opt.fns}'; }
+ select_first_match=%{ SELECT metadata_and_text.tid FROM metadata_and_text WHERE metadata_and_text.src_filename = '#{@opt.fns}'; }
file_exist=@sql_type=~/sqlite/ \
? @conn.get_first_value(select_first_match) \
: @conn.select_one(select_first_match)
@@ -265,7 +265,10 @@ module SiSU_DB_import
@col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_minus
special_character_escape(@col[:body])
@col[:plaintext]=@col[:body].dup
- strip_markup(@col[:plaintext])
+ @col[:plaintext]=strip_markup(@col[:plaintext])
+ @col[:plaintext]=clean_searchable_text(@col[:plaintext])
+ @col[:words]=@col[:plaintext].dup
+ @col[:words]=unique_words(@col[:words])
if @en[0]; @en_a,@en_z=@en[0].first,@en[0].last
end
if @en_ast[0]; @en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last
@@ -298,7 +301,10 @@ module SiSU_DB_import
@col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_plus
special_character_escape(@col[:body])
@col[:plaintext]=@col[:body].dup
- strip_markup(@col[:plaintext])
+ @col[:plaintext]=strip_markup(@col[:plaintext])
+ @col[:plaintext]=clean_searchable_text(@col[:plaintext])
+ @col[:words]=@col[:plaintext].dup
+ @col[:words]=unique_words(@col[:words])
@en_a,@en_z=@en[0].first,@en[0].last if @en[0]
@en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
@en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -324,7 +330,10 @@ module SiSU_DB_import
@col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_plus
special_character_escape(@col[:body])
@col[:plaintext]=@col[:body].dup
- strip_markup(@col[:plaintext])
+ @col[:plaintext]=strip_markup(@col[:plaintext])
+ @col[:plaintext]=clean_searchable_text(@col[:plaintext])
+ @col[:words]=@col[:plaintext].dup
+ @col[:words]=unique_words(@col[:words])
@en_a,@en_z=@en[0].first,@en[0].last if @en[0]
@en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
@en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -350,7 +359,10 @@ module SiSU_DB_import
@col[:body]=SiSU_Format_Shared::CSS_Format.new(@md,data).lev4_plus
special_character_escape(@col[:body])
@col[:plaintext]=@col[:body].dup
- strip_markup(@col[:plaintext])
+ @col[:plaintext]=strip_markup(@col[:plaintext])
+ @col[:plaintext]=clean_searchable_text(@col[:plaintext])
+ @col[:words]=@col[:plaintext].dup
+ @col[:words]=unique_words(@col[:words])
@en_a,@en_z=@en[0].first,@en[0].last if @en[0]
@en_a_asterisk,@en_z_asterisk=@en_ast[0].first,@en_ast[0].last if @en_ast[0]
@en_a_plus,@en_z_plus=@en_pls[0].first,@en_pls[0].last if @en_pls[0]
@@ -389,12 +401,15 @@ module SiSU_DB_import
end
special_character_escape(@col[:body])
@col[:plaintext]=@col[:body].dup
- strip_markup(@col[:plaintext])
+ @col[:plaintext]=strip_markup(@col[:plaintext])
+ @col[:plaintext]=clean_searchable_text(@col[:plaintext])
+ @col[:words]=@col[:plaintext].dup
+ @col[:words]=unique_words(@col[:words])
t=SiSU_DB_tuple::Load_documents.new(@conn,@col,@opt,@file)
@tuple_array << t.tuple
@en,@en_ast,@en_pls=[],[],[]
@col[:en_a]=@col[:en_z]=nil
- @col[:lev]=@col[:plaintext]=@col[:body]=''
+ @col[:lev]=@col[:plaintext]=@col[:body]=@col[:words]=''
end
if notedata =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/ #% import into database endnotes tables
endnote_array=notedata.scan(/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/)
@@ -406,9 +421,9 @@ module SiSU_DB_import
@id_n+=1
special_character_escape(txt)
body=SiSU_Format_Shared::CSS_Format.new(@md,data).endnote(nr,txt)
- #special_character_escape(body)
- #special_character_escape(txt)
strip_markup(txt)
+ words=txt.dup
+ words=unique_words(words)
if txt.size > (SiSU_DB_columns::Column_size.new.endnote_clean - 1)
puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n"
open("#{Dir.pwd}/pg_documents_error_log",'a') do |error|
@@ -418,16 +433,17 @@ module SiSU_DB_import
end
if txt
en={ :type => 'endnotes',
- :id => @id_n,
- :lid => @col[:lid],
- :nr => nr,
- :txt => txt,
- :body => body,
- :ocn => @col[:ocn],
- :ocnd => @col[:ocnd],
- :ocns => @col[:ocns],
- :id_t => @@id_t,
- :hash => digest_clean
+ :id => @id_n,
+ :lid => @col[:lid],
+ :nr => nr,
+ :txt => txt,
+ :body => body,
+ :words => words,
+ :ocn => @col[:ocn],
+ :ocnd => @col[:ocnd],
+ :ocns => @col[:ocns],
+ :id_t => @@id_t,
+ :hash => digest_clean
}
t=SiSU_DB_tuple::Load_endnotes.new(@conn,en,@opt,@file)
@tuple_array << t.tuple
@@ -447,6 +463,8 @@ module SiSU_DB_import
special_character_escape(txt)
body=SiSU_Format_Shared::CSS_Format.new(@md,data).endnote(nr,txt)
strip_markup(txt)
+ words=txt.dup
+ words=unique_words(words)
if txt.size > (SiSU_DB_columns::Column_size.new.endnote_clean - 1)
puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n"
open("#{Dir.pwd}/pg_documents_error_log",'a') do |error|
@@ -456,16 +474,17 @@ module SiSU_DB_import
end
if txt
en={ :type => 'endnotes_asterisk',
- :id => @id_n,
- :lid => @col[:lid],
- :nr => nr,
- :txt => txt,
- :body => body,
- :ocn => @col[:ocn],
- :ocnd => @col[:ocnd],
- :ocns => @col[:ocns],
- :id_t => @@id_t,
- :hash => digest_clean
+ :id => @id_n,
+ :lid => @col[:lid],
+ :nr => nr,
+ :txt => txt,
+ :body => body,
+ :words => words,
+ :ocn => @col[:ocn],
+ :ocnd => @col[:ocnd],
+ :ocns => @col[:ocns],
+ :id_t => @@id_t,
+ :hash => digest_clean
}
t=SiSU_DB_tuple::Load_endnotes.new(@conn,en,@opt,@file)
@tuple_array << t.tuple
@@ -485,6 +504,8 @@ module SiSU_DB_import
special_character_escape(txt)
body=SiSU_Format_Shared::CSS_Format.new(@md,data).endnote(nr,txt)
strip_markup(txt)
+ words=txt.dup
+ words=unique_words(words)
if txt.size > (SiSU_DB_columns::Column_size.new.endnote_clean - 1)
puts "\n\nTOO LARGE (ENDNOTE - see error log)\n\n"
open("#{Dir.pwd}/pg_documents_error_log",'a') do |error|
@@ -494,16 +515,17 @@ module SiSU_DB_import
end
if txt
en={ :type => 'endnotes_plus',
- :id => @id_n,
- :lid => @col[:lid],
- :nr => nr,
- :txt => txt,
- :body => body,
- :ocn => @col[:ocn],
- :ocnd => @col[:ocnd],
- :ocns => @col[:ocns],
- :id_t => @@id_t,
- :hash => digest_clean
+ :id => @id_n,
+ :lid => @col[:lid],
+ :nr => nr,
+ :txt => txt,
+ :body => body,
+ :words => words,
+ :ocn => @col[:ocn],
+ :ocnd => @col[:ocnd],
+ :ocns => @col[:ocns],
+ :id_t => @@id_t,
+ :hash => digest_clean
}
t=SiSU_DB_tuple::Load_endnotes.new(@conn,en,@opt,@file)
@tuple_array << t.tuple
@@ -526,25 +548,25 @@ module SiSU_DB_import
endnotes(@txt).range
@en << endnotes(@txt).standard if @txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/
@en_ast << endnotes(@txt).asterisk if @txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/
- @en_pls << endnotes(@txt).plus if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_o]}/
+ @en_pls << endnotes(@txt).plus if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/
@txt=endnotes(@txt).clean_text
end
@txt
end
def standard
- x=if @txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/; @txt.scan(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/)
- else nil
- end
+ x=(@txt =~/#{Mx[:en_a_o]}.+?#{Mx[:en_a_c]}/) \
+ ? @txt.scan(/#{Mx[:en_a_o]}(\d+).+?#{Mx[:en_a_c]}/) \
+ : nil
end
def asterisk
- x=if @txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/; @txt.scan(/#{Mx[:en_b_o]}[*](\d+).+?#{Mx[:en_b_c]}/)
- else nil
- end
+ x=(@txt =~/#{Mx[:en_b_o]}\*.+?#{Mx[:en_b_c]}/) \
+ ? @txt.scan(/#{Mx[:en_b_o]}[*](\d+).+?#{Mx[:en_b_c]}/) \
+ : nil
end
def plus
- x=if @txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/; @txt.scan(/#{Mx[:en_b_o]}[+](\d+).+?#{Mx[:en_b_c]}/)
- else nil
- end
+ x=(@txt =~/#{Mx[:en_b_o]}\+.+?#{Mx[:en_b_c]}/) \
+ ? @txt.scan(/#{Mx[:en_b_o]}[+](\d+).+?#{Mx[:en_b_c]}/) \
+ : nil
end
def clean_text(base_url=nil)
if base_url