Code: Select all
(***************************************************
Ant Movie Catalog importation script
www.antp.be/software/moviecatalog/
[Infos]
Authors=Fulvio53s03
Title=wikipedia.it
Description=import film e serie TV
Site=http://www.wikipedia.it
Language=IT
Version=0.2
Requires=4.2.1
Comments=puo' essere usato per ricerca informazioni tramite google (mode=0) oppure per estrarle direttamente da wikipedia.it (mode=1) inserendo l'URL direttamente. |Inserire il termine (Stagione....) nel titolo tradotto rende migliore la ricerca della cover in bing.com per le serie TV
License=*  The source code of the script can be used in   |*  another program only if full credits to Fulvio53s03*
GetInfo=1
RequiresMovies=1
[Options]
Mode=0|0|0=normal mode|1=batch mode (url)
Origine_foto=1|0|0=img da wiki|1=img da bing
[Parameters]
***************************************************)
// TypeSearch=1|1|1=search for exact expression|2=search for any word
// TypeOrder=3|3|1=order by Artist|2=order by film_o_serie Title|3=order by relevance
// needs the following units
// StringUtils7552.pas
//
program wiki;
uses
   StringUtils7552;
   
const
   UrlBase = 'https://it.wikipedia.org';                  // base url; batch mode only accepts stored URLs containing it
   bing_Base  = 'https://www.bing.com/images/search?q=';  // prefix of the Bing image-search URL used for covers
   debug_search = false;                                   // debug mode on/off for the search step (pages are dumped into `folder`)
   debug_film_o_serie = false;                            // debug mode on/off for the film/series data extraction step
   folder = 'f:\prova\';                                  // directory where the debug dump files are saved
   CRLFspace = CRLF + ' ';
//2016-02-10   tappo_fine_ricerca = '<li class="g">Tappodifinericercadaticongoogle</li><li class="g">';
   // Sentinel ("stopper") appended after the Google result list; the result
   // extraction loop stops when the current block equals this value.
   tappo_fine_ricerca = '<div class="g">Tappodifinericercadaticongoogle</div><div class="g">';    //2016-02-10
   MaxItems = 50;
var
   // Script-wide working variables. Most of them are scratch strings that are
   // reused by several procedures (e.g. `label`, `initchar`, `endchar`).
   nuovo_anno, mese_anno, solo_anno, data_rilascio, label, stagione: String;
   Credit_address, Readdress, Page: String;
   film_o_serieName, ArtistName, firstcall, name, autori: String;
   Formato, Releases, artista: String;
   file_name, campo_URL: String;
   tipo_ricerca, pag_ricerca, save_value, ordine_lista, Salva_autore: String;
   value, episodi, lista_episodi, riquadro: string;
   CharNormal, CharAbNormal: string;
   // NOTE(review): a second flag named film_o_serieok is declared further down;
   // confirm which of the two is the intended one.
   film_o_serie_ok: boolean;
   // BatchMode and origine_foto_opt hold the 'Mode' and 'Origine_foto' script options.
   BatchMode, origine_foto_opt, giri, anno_int: Integer;
   pos_chr, lgth_file_name: Integer;
   titolo_e_autore, titolo, autore, durata, cover: String;
   save_comments, commento_episodi, commento_skeda: String;
   batchlog, confbatch: TstringList;
   save_pag, artist, initchar, endchar: string;
   // Set to True once a detail page has been processed; ends the prompt loop in normal mode.
   film_o_serieok: boolean;
   bing_url, stagione_special: string;
//------------------------------------------------------------------------------
// list of film_o_series
//------------------------------------------------------------------------------
procedure GetList;
// Searches it.wikipedia.org through Google for film_o_serieName, fills the
// pick tree with the results found and, when the user selects one, hands its
// address to AnalyzeMoviePage.
//
// Reads the globals file_name and film_o_serieName; writes Page, pag_ricerca,
// save_pag, initchar, endchar and the URL field of the current record.
var
   Address, Title_album, dati_album, blocco: String;
   init_control, end_control, init_ol, end_ol: String;
   Desc_ricerca: String;
   ctr_loop, pos_blocco: integer;
   found, fine_lista: boolean;
begin
   PickTreeClear;                                                    // clear list
   Desc_ricerca := file_name;
   if Desc_ricerca = '' then
      Desc_ricerca := film_o_serieName;

   // Only the user-supplied term is URL-encoded: encoding the complete address
   // would also escape the separators of the query string itself.
   Address := 'https://www.google.it/search?num=20&as_q=' + UrlEncode(film_o_serieName);
   Address := Address + '&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all';
   Address := Address + '&as_sitesearch=it.wikipedia.org&as_occt=any&safe=images&as_filetype=&as_rights=';
   Page := GetPage(Address);
   SetField(fieldURL, Address);
   HTMLdecode(Page);
   if debug_search then
   begin
      DumpPage(folder+'googleListPage.html', Page);                // debug
      DumpPage(folder+'wikiList.txt', Page);                       // debug
   end;

   if Pos('find anything', Page) <> 0 then
   begin
      LogMessage('Error while reading selection page - no results found for ' + Desc_ricerca);
      exit;
   end;

   // Header row of the pick tree; it carries no address of its own.
   PickTreeAdd('Lista articoli trovati per "' + Desc_ricerca + '"', '');

   init_ol := '<ol>';
   end_ol  := '</ol>';
   // Result list followed by the sentinel block that marks the end of the search.
   pag_ricerca := TextBetween(Page, init_ol, end_ol) + tappo_fine_ricerca + end_ol;
   if debug_search then
      DumpPage(folder+'wikiricerca.html', pag_ricerca);                // debug
   save_pag := pag_ricerca;

   // Each result is the text between two consecutive '<div class="g">' markers.
   init_control := '<div class="g">';
   end_control  := '<div class="g">';
   found := False;
   fine_lista := False;
   ctr_loop := 0;
   // MaxItems bounds the loop so that an unexpected page layout (sentinel never
   // matched) cannot make the script spin forever.
   while (not fine_lista) and (ctr_loop < MaxItems) do
   begin
      pos_blocco := Pos(init_control, pag_ricerca);
      if pos_blocco = 0 then
         fine_lista := True
      else
      begin
         blocco := TextBetween(pag_ricerca, init_control, end_control);
         dati_album := init_control + blocco + end_control;
         // Consume everything up to (but not including) the marker that opens
         // the next block, wherever the current block started.
         Delete(pag_ricerca, 1, pos_blocco - 1 + length(init_control) + length(blocco));
         if debug_search and (ctr_loop = 0) then
         begin
            DumpPage(folder+'google_dati_album.txt', dati_album);    // debug
            DumpPage(folder+'wikiricerca-1.txt', pag_ricerca);       // debug
         end;

         if dati_album = tappo_fine_ricerca then
            fine_lista := True
         else
         begin
            // Address of the linked page: try https first, then http, each with
            // the two known terminators.
            initchar := 'https://';
            endchar  := '/%252';
            Address  := TextBetween(dati_album, initchar, endchar);
            if Address = '' then
            begin
               endchar := '&sa';
               Address := TextBetween(dati_album, initchar, endchar);
            end;
            if Address = '' then
            begin
               initchar := 'http://';
               endchar  := '/%252';
               Address  := TextBetween(dati_album, initchar, endchar);
            end;
            if Address = '' then
            begin
               endchar := '&sa';
               Address := TextBetween(dati_album, initchar, endchar);
            end;

            // Blocks without a usable link are skipped instead of being listed
            // with a bare 'http://' address.
            if Address <> '' then
            begin
               Address := initchar + Address;
               initchar := '<h3 class="r">';
               endchar  := '</h3>';
               Title_album := TextBetween(dati_album, initchar, endchar);   // link description
               HTMLRemoveTags(Title_album);
               PickTreeAdd(Title_album, Address);
               found := True;
            end;
         end;
      end;
      ctr_loop := ctr_loop + 1;
   end;

   if not found then
   begin
      LogMessage('No album found for ' + Desc_ricerca);
      exit;
   end;
   if PickTreeExec(Address) then
      AnalyzeMoviePage(Address)
   else
      LogMessage('No serie/film selected');
end;
//------------------------------------------------------------------------------
// ANALYZE Movie PAGE
//------------------------------------------------------------------------------
procedure AnalyzeMoviePage(Address: string);
// Resets the global comment buffers and extracts the data of the page at
// Address through estrazione_scheda.
//   Address: URL of the page to analyze.
begin
   save_comments    := '';
   commento_episodi := '';
   commento_skeda   := '';
   estrazione_scheda(Address);
end;
//------------------------------------------------------------------------------
// ESTRAE INFORMAZIONI DA SCHEDA
//------------------------------------------------------------------------------
procedure estrazione_scheda(Address: string);
// Downloads the page at Address and fills the fields of the current record
// (URL, date, translated/original title, description, comments, country,
// category, director, actors, year, producer, writer, composer) and the cover.
//   Address: URL of the page to read.
// Uses the globals Page, Value, riquadro, label, initchar, endchar, cover,
// pos_chr and bing_url as working storage and sets film_o_serieok to True.
var
   titolo_minuscolo, titolo_serie: string;
begin
   Page := GetPage(Address);
   SetField(fieldURL, Address);
   Page := UTF8decode(Page);
   HTMLdecode(Page);
   if debug_film_o_serie then
      DumpPage(folder+'wikiPageDetail.txt', Page);                         // debug_film_o_serie
   campo_URL := getfield(fieldurl);
   SetField(fieldDate, DateToStr(Date));
   // Normalizza_page declares its parameter without "var", so Page itself is
   // left untouched by this call.
   Normalizza_Page(Page);
   if debug_film_o_serie then
      DumpPage(folder+'wikiPageDetailHTMLdecode.txt', Page);               // debug_film_o_serie
   film_o_serieok := True;
   Value := Page;
   // riquadro = the "sinottico" summary table; most fields are read from it
   riquadro := textbetween(Value, '<table class="sinottico"', '</table>');
   if debug_film_o_serie then
      DumpPage(folder+'wikiriquadro.txt', riquadro);

//*** translated title (only when the field is still empty)
   initchar := '<title>';
   endchar :=  '</title>';
   label := textbetween(Value, initchar, endchar);
   label := stringreplace(label, ' - Wikipedia', '');
   if getfield(fieldtranslatedTitle) = '' then
      SetField(fieldtranslatedTitle, label);

//*** description (plot): "Trama" section, then "Descrizione", then the text
//    between the first table and the first <h2>
   label := textbetween(value, '<span class="mw-headline" id="Trama">Trama</span>', '<h');
   label := textbetween(label, '<p>', '</p>');
   if label = '' then
   begin
      label := textbetween(value, '<span class="mw-headline" id="Descrizione">Descrizione</span>', '<h');
      label := textafter(label, '</h2>');
   end;
   if debug_film_o_serie then
      DumpPage(folder+'wikiDescrizione.html', label);               // debug_film_o_serie
   if label = '' then
   begin
      label := textafter(value, '</table>');
      label := textbefore(label, '<h2>', '');
   end;
   HTMLRemoveTags(label);
   label := fulltrim(label);
   SetField(fielddescription, label);

//*** comment (general description preceding the table of contents)
   initchar := '<p><i><b>';
   label := initchar + textbetween(value, initchar, '<div id="toc" class="toc">');
   HTMLRemoveTags(label);
   SetField(fieldcomments, label);

//   *** data extracted from the summary table ***
//*** country
   initchar := '>Paese';
   endchar :=  '</tr>';
   label := TextBetween(riquadro, initchar, endchar);
   HTMLRemoveTags(label);
   label := stringReplace(label, 'di produzione', '');
   label := stringReplace(label, 'Stati Uniti d''America', 'USA');
   label := fulltrim(label);
   SetField(fieldcountry, label);

//*** cover taken from the page
   cover := TextBetween(riquadro, 'src="', '"');
   // Only protocol-relative addresses ("//host/...") need the scheme added.
   if copy(cover, 1, 2) = '//' then
      cover := 'https:' + cover;
   if (origine_foto_opt = 0) and (cover <> '') then
      GetPicture(cover);

//*** cover taken from Bing
   // The season/series marker is searched case-insensitively; lower-casing
   // keeps the character positions of film_o_serieName.
   titolo_minuscolo := AnsiLowerCase(film_o_serieName);
   pos_chr := pos('(stagione', titolo_minuscolo);
   if pos_chr = 0 then
      pos_chr := pos(' stagione ', titolo_minuscolo);
   if pos_chr = 0 then
      pos_chr := pos(' serie ', titolo_minuscolo);
   if pos_chr = 0 then
      bing_url := bing_Base + UrlEncode('COVER ' + film_o_serieName)
   else
   begin
      // Keep only what precedes the marker. Copy + Trim avoids cutting the last
      // letter of the title when the marker itself starts with a space.
      titolo_serie := Trim(copy(film_o_serieName, 1, pos_chr - 1));
      bing_url := bing_Base + UrlEncode('SERIE ' + titolo_serie);
   end;
   bing_url := bing_url + '&qft=+filterui:imagesize-large+filterui:aspect-tall&FORM=R5IR28';
   if origine_foto_opt = 1 then
      AnalyzeBing(bing_url);

//*** original title
   label := textbetween(riquadro, '>Titolo originale<', '</td>');
   initchar := '<td';
   endchar :=  '</td>';
   label := initchar + textbetween(label, initchar, '</i>');
   label := FormatText(label);
   SetField(fieldoriginalTitle, label);

//*** category
   initchar := '>Genere<';
   label := textbetween(riquadro, initchar, '</tr>');
   initchar := '<td';
   endchar :=  '</td>';
   label := initchar + textbetween(label, initchar, endchar);
   HTMLRemoveTags(label);
   label := AnsiMixedCase(AnsiLowerCase(label), ' ');
   SetField(fieldcategory, label);

//*** director
   initchar := '>Regia</a>';
   endchar :=  '</td>';
   label := TextBetween(riquadro, initchar, endchar);
   HTMLRemoveTags(label);
   label := FormatText(label);
   SetField(fieldDirector, label);

//*** characters and actors: from the first <li> to the first following </ul>
   initchar := '<li>';
   endchar :=  '</ul>';
   label := initchar + TextBetween(riquadro, initchar, endchar) + endchar;
   HTMLRemoveTags(label);
   SetField(fieldactors, label);

//*** year
   initchar := '>Anno';
   endchar :=  '</a>';
   label := TextBetween(riquadro, initchar, endchar);
   label := stringReplace(label, ' - in produzione', '');
   HTMLRemoveTags(label);
   SetField(fieldyear, label);

//*** producer
   initchar := '>Casa di produzione<';
   endchar := '</tr>';
   label := textbetween(riquadro, initchar, endchar);
   // The leading '<' turns the leftover of the cut header tag into a complete
   // tag so that HTMLRemoveTags strips it.
   label := '<' + stringReplace(label, '<br />', ', ');
   HTMLRemoveTags(label);
   SetField(fieldproducer, label);

//*** writer: "Soggetto" (film), otherwise "Ideatore" (TV series)
   initchar := '>Soggetto';
   label := textbetween(value, initchar, '</tr>');
   HTMLRemoveTags(label);
   // Trimmed so that a row containing only white space still triggers the fallback.
   label := fulltrim(label);
   if label = '' then
   begin
      initchar := '<th>Ideatore</th>';
      label := textbetween(value, initchar, '</tr>');
      HTMLRemoveTags(label);
      label := fulltrim(label);
   end;
   SetField(fieldwriter, label);

//*** composer
   initchar := '>Musiche';
   endchar := '</tr>';
   label := textbetween(riquadro, initchar, endchar);
   label := '<' + stringReplace(label, '<br />', ', ');
   HTMLRemoveTags(label);
   SetField(fieldcomposer, label);
end;
Procedure Normalizza_page(Pagina: string);
// Removes the CRLFs from Pagina and turns a set of upper-case tag openers into
// lower case.
//
// NOTE(review): Pagina is a value parameter, so the normalized text stays in
// the local copy and is not handed back to the caller. Review every caller
// before turning it into a "var" parameter: the first replacement rewrites the
// whole content of each tag, and callers search the page for mixed-case
// attribute values.
//
// Side effect: leaves the last search/replace pair in the globals
// CharAbNormal and CharNormal.
begin
  pagina := RegExprSetReplace('<([^>]+)>', pagina, '<\L\1>', true);
  CharAbNormal := crlf;       CharNormal := ' ';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<B';       CharNormal := '<b';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</B';      CharNormal := '</b';
  // Works on pagina like every other step; the global Value used to be read
  // here, which threw away all the replacements done above.
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<FONT';    CharNormal := '<font';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</FONT';   CharNormal := '</font';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<TR';      CharNormal := '<tr';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</TR';     CharNormal := '</tr';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<TD';      CharNormal := '<td';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</TD';     CharNormal := '</td';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<DIV';     CharNormal := '<div';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</DIV';    CharNormal := '</div';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<Ol';      CharNormal := '<ol';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</Ol';     CharNormal := '</ol';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
end;
//------------------------------------------------------------------------------
// set show warning (normal mode) or add to log (batch mode)
//------------------------------------------------------------------------------
procedure LogMessage(m: string);
// Reports the message m: as a warning dialog when BatchMode is not positive,
// otherwise through AddToLog, prefixed with the number of the current item.
begin
   if BatchMode <= 0 then
   begin
      ShowWarning(m);
      exit;
   end;
   AddToLog('item ' + GetField(fieldNumber) + ': ' + m);
end;
//------------------------------------------------------------------------------
// add a message in the batch log and save to disk
// (because I don't know when it's finished...)
//------------------------------------------------------------------------------
procedure AddToLog(m: string);
// Intended to append the message m to the batch log and save it to disk.
// The body is currently empty: both statements below were commented out
// (marker fs2016-12-31), so calling this procedure does nothing.
begin
//fs2016-12-31    batchlog.Add(m);
//fs2016-12-31    batchlog.SaveToFile(batchlogfic);
end;
//------------------------------------------------------------------------------
// process batch mode
//------------------------------------------------------------------------------
procedure wikiBatch;
// Batch mode: analyzes the page stored in the URL field of the current record.
// Records whose URL is empty or does not contain UrlBase are reported through
// LogMessage and skipped.
var
   stored_url: string;
begin
   stored_url := GetField(fieldUrl);
   film_o_serieName := stored_url;
   if (stored_url = '') or (Pos(UrlBase, stored_url) <= 0) then
   begin
      LogMessage('ignored url="'+stored_url+'"');
      exit;
   end;
   AnalyzeMoviePage(stored_url);
end;
//------------------------------------------------------------------------------
// process normal mode
//------------------------------------------------------------------------------
procedure wikiNorm;
// Normal (interactive) mode: proposes the translated title (or, when empty,
// the original title) as search term, lets the user edit it and runs GetList.
// The prompt is repeated until film_o_serieok is set; cancelling the dialog or
// entering an empty name leaves the procedure.
begin
   file_name := getField(fieldfilepath);
   if GetField(fieldTranslatedTitle) <> '' then
      film_o_serieName := GetField(fieldTranslatedTitle)
   else
      film_o_serieName := GetField(fieldOriginalTitle);
   repeat
      if not Input('wiki = ' + ArtistName + ' - ' + film_o_serieName, 'Scrivi il nome del film o della serie:'
          + crlf, film_o_serieName) or (film_o_serieName = '') then exit;
      formato := getfield(FieldOriginalTitle);
      // Pos takes the substring first: look for 'serie' inside the title, not
      // for the title inside 'serie'.
      if pos('serie', formato) > 0 then
         formato := 'serie';
      GetList;
   until film_o_serieok;
end;
//---------------------------- estrazione immagine da Bing ------------------------
// ---
function RemovePar(wholetext: string) : string;
// Returns wholetext trimmed. When there is text before the first '(' and a ')'
// occurs in RemainingText, the result is instead the text before '(' joined by
// one space to what TextAfter returns for ')' in RemainingText.
// NOTE(review): RemainingText is not declared in this file; presumably it is
// set by TextBefore in the StringUtils7552 unit - confirm.
var
  head, tail: String;
begin
  head := Trim(TextBefore(wholetext, '(', ''));
  tail := RemainingText;
  if (head <> '') and (Pos(')', tail) > 0) then
    wholetext := head + ' ' + Trim(TextAfter(tail, ')'));
  result := Trim(wholetext);
end;
// ---
procedure AnalyzeBing(bing_Address: string);
// Downloads the Bing image-search page at bing_Address and loads the picture
// referenced by the first '<a class="thumb"' entry as cover of the current
// record. Does nothing when the page holds no such entry or no link.
//   bing_Address: complete URL of the image search.
var
  PageText, strTest, strImg: string;
begin
  // Read the whole page
  PageText := GetPage(bing_Address);
  if debug_search then
     DumpPage(folder+'bingSearchPage.html', PageText);                // debug

  // First thumbnail entry. The emptiness test is done on the extracted text
  // itself, before any marker is added back, so a page without results is
  // really detected.
  strTest := TextBetween(PageText, '<a class="thumb"', '<div class="fileInfo">');
  if strTest = '' then
     exit;

  // Image reference = target of the first href inside the entry
  strImg := TextBetween(strTest, 'href="', '"');
  if strImg <> '' then
     GetPicture(strImg);
end;
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//  start here
//------------------------------------------------------------------------------                                             
begin
   // Entry point: check the host version, read the user's options once, then
   // dispatch on the selected mode (1 = batch, 0 = normal).
   if CheckVersion(4,2,1) then
   begin
      BatchMode := GetOption('Mode');
      origine_foto_opt := GetOption('Origine_foto');
      if BatchMode = 1 then
         wikiBatch
      else if BatchMode = 0 then
         wikiNorm;
   end
   else
      ShowMessage('This script requires a newer version of Ant Movie Catalog (at least the version 4.2.1)');
end.


