[UPD] sergiobonellieditore.it fumetti

New scripts, templates and translation files that allow Ant Movie Catalog to be used to manage things other than movies
Post Reply
fulvio53s03
Posts: 764
Joined: 2007-04-28 05:46:43
Location: Italy

[UPD] sergiobonellieditore.it fumetti

Post by fulvio53s03 »

Nuova Release dello script di estrazione informazioni collane di fumetti Bonelli.
Il sito è stato completamente ristrutturato e l'estrazione dati si è rivelata piuttosto complessa, usando Google come motore di ricerca della pagina del fumetto.
Può funzionare in due modi:
- Batch indicando l'url di estrazione dei dati
- Norm trova con Google la pagina di estrazione dati in base al contenuto del campo 'titolo tradotto' (consiglio di compilare il campo con titolo albo e nome collana o personaggio).
In modalità Norm si può scegliere di automatizzare l'estrazione dati dalla prima segnalazione Google: di solito è quella giusta ma ci sono alcune eccezioni.
Segnalazioni e suggerimenti sono, come sempre, bene accetti.
;)

Code: Select all

(***************************************************

Ant Movie Catalog importation script
www.antp.be/software/moviecatalog/

[Infos]
Authors=Fulvio53s03
Title=sergiobonellieditore.it.ifs
Description=estrae dati albi Sergio Bonelli Editore
Site=
Language=IT
Version=2.0
Requires=4.2.1
Comments=puo' essere usato per ricerca informazioni tramite google (mode=0) oppure per estrarle direttamente da SergioBonelli.it (mode=1) inserendo l'URL direttamente.
License=*  The source code of the script can be used in   |*  another program only if full credits to Fulvio53s03*
GetInfo=1
RequiresMovies=1

[Options]
Mode=1|0|0=normal mode|1=batch mode (url)
Primo_OK=1|0|0=normal mode|1=primo link (sono fortunato)

[Parameters]

***************************************************)

// Script-wide declarations: external unit, debug switches, sentinels and the
// global scratch variables shared by all procedures below.
program sergiobonellieditore;
uses
  StringUtils7552;   // External string-helper unit; must be in the scripts folder. NOTE(review): the original comment named the file StringUtils1.pas, which does not match the unit name used here - confirm which one is correct.
const
  debug_search = false;                                   // debug switch: when true, the search pages are dumped to files
  debug_film_o_serie = false;                             // debug switch: when true, the album detail page and plot are dumped to files
  folder = 'f:\prova\';                                   // directory where the debug dump files are saved
  tappo = CRLF + 'xyzxyz';                                // sentinel ("plug") inserted into the plot text by AnalyzePageAlbo
  tappo_fine_ricerca = '<div class="g">Tappodifinericercadaticongoogle</div><div class="g">';    // sentinel block appended to the search results to mark their end (2016-02-10)

var
  ComicURL, ComicSeries, ComicNumber, MediaType, Collana: string;   // Define some script variables
  Page, SavePage, pag_ricerca, Save_pag, Value, saveValue : string; // downloaded page, search-result text and scratch values
  InitChar, EndChar, CharAbNormal, CharNormal : String;             // scratch delimiters / replacement pairs
  BatchMode, Fortunato:   integer;                                  // values of the 'Mode' and 'Primo_OK' script options
  sw_serie, StartDelimiter, endDelimiter, saveActors : string;      // scratch delimiters and the extracted authors text
  NumFumetto, numCollana : integer;
  LgthPage, lungo : integer;
  CharCut, StartPos, EndPos: integer;

  film_o_serieok: boolean;                                          // set to True by AnalyzePageAlbo; tested by BonelliNorm's loop
  file_name, film_o_serieName: string;                              // record file path and the text searched for

// Normalises the downloaded HTML held in the global Page so that the later
// TextBetween lookups can rely on a single spelling: the credits label gets a
// lower-case "sceneggiatura" and the upper-case B / DIV tag prefixes are
// rewritten in lower case. The result is also copied to the global SavePage.
procedure NormalizePage;
begin
  // credits label
  Page := StringReplace(Page, 'Soggetto e Sceneggiatura:', 'Soggetto e sceneggiatura:');

  // opening and closing tag prefixes starting with an upper-case B
  Page := StringReplace(Page, '<B', '<b');
  Page := StringReplace(Page, '</B', '</b');

  // DIV tags
  Page := StringReplace(Page, '<DIV ', '<div ');

  // The last pair goes through the global scratch variables so that they are
  // left holding the same values as before this rewrite.
  CharAbNormal := '</DIV>';
  CharNormal := '</div>';
  Page := StringReplace(Page, CharAbNormal, CharNormal);

  SavePage := Page;
end;

// ***** Analyze Item's Page *****
// Downloads the album page at URL and stores what it finds in the current
// record: today's date, the URL, authors (Actors), plot (Description), the
// cover picture with its size and resolution, issue number (Media), series
// (MediaType), title (TranslatedTitle) and periodicity / release date (Source).
// Always sets the global film_o_serieok to True.
// If the download returns nothing, a message is shown and no field is touched.
procedure AnalyzePageAlbo(URL: String);   // Variable "URL" is handed over (former variable "ComicURL")
var
  Descr_Img: string;
  Pict_dim: double;
  Pict_width, Pict_height: Integer;
begin
  film_o_serieok := True;
  Page := GetPage(URL);   // Fetch source code from website and store inside "Page"
  if Page = '' then
  begin
    // Nothing was downloaded: leave the record alone instead of overwriting
    // its fields with empty or label-only values.
    ShowMessage('Pagina non disponibile: ' + URL);
    exit;
  end;
  NormalizePage;
  Page := UTF8Decode(Page);
  if debug_film_o_serie then
    DumpPage(folder+'BonelliPageDetail.txt', Page);   // debug dump of the detail page

  // Date of update and source URL
  SetField(fieldDate, DateToStr(Date));
  SetField(fieldURL, URL);

  // fieldActors / authors
  startdelimiter := 'class="vc_eco_product_body">';
  endDelimiter := '<link rel="stylesheet" href="shortcodeTemplates/css/articlePreview.css">';
  Value := TextBetween(Page, startdelimiter, enddelimiter);             // extract part from variable "Page"

  Value := removespaces(Value, True);                                   // remove the spaces between > and <
  // put every "nome" label on its own line
  Value := StringReplace(Value, '<span class="nome">', (CRLF + '<span class="nome">'));
  HTMLDecode(Value);   // Clean HTML codes (if some exist)
  HTMLRemoveTags(Value);
  // Drop the line break inserted before the first label. The original call
  // used index 0, for which Delete removes nothing.
  if Copy(Value, 1, Length(CRLF)) = CRLF then
    Delete(Value, 1, Length(CRLF));
  Value := FullTrim(Value);             // Clean up
  Value := StringReplace(Value, '  ', ' ');
  Value := StringReplace(Value, '  ', ' ');
  Value := StringReplace(Value, (' ' + CRLF), CRLF);
  saveActors := Value;
  SetField(fieldActors, Value);         // Save to field Actors

  // Plot (and, as a fallback, authors)
  //2017-07-13  Value := TextBetween(page, '<script type="application/ld+json">', ' </script>');   // extract description part from variable "Page"
  //2017-07-13  Value := TextBetween(Value, '"text":"', '","keywords"');
  Value := TextBetween(Page, '<div itemprop="text" class="testo_articolo testo testoResize">', ' </div>');   //2017-07-13

  if debug_film_o_serie then
    DumpPage(folder+'Bonellitrama.txt', Value);
  HTMLDecode(Value);   // Clean description from HTML codes (if some exist)
  // Mark the first paragraph boundary with the sentinel "tappo"
  Value := StringReplace(Value, '<br /><br />', ('<br />' + tappo));
  if Pos(tappo, Value) = 0 then                                            //2017.07.26
    Value := StringReplace(Value, ('</p>' + CRLF + '<p>'), ('<br />' + tappo));        //2017.07.26
  // '</b>' is parked as 'qwer' so the '>' clean-ups below do not swallow the
  // line break that replaces it afterwards
  Value := StringReplace(Value, '</b>', 'qwer');                           //2017-07-13
  Value := StringReplace(Value, '</strong>', CRLF);
  Value := StringReplace(Value, '> ', '>');
  Value := StringReplace(Value, '>' + CRLF, '>');                          //2017.07.21
  Value := StringReplace(Value, 'qwer', CRLF);
  Value := StringReplace(Value, (' ' + CRLF), CRLF);
  Value := StringReplace(Value, (CRLF + CRLF), CRLF);
  Value := StringReplace(Value, (CRLF + CRLF), CRLF);

  HTMLRemoveTags(Value);
  Value := FullTrim(Value);   // Clean up the description
  // No authors found above: take the text before the sentinel as authors and
  // the text after it as the plot.
  // NOTE(review): when authors were found above, nothing visible here removes
  // the 'xyzxyz' sentinel from Value before it is stored - confirm it cannot
  // end up in the Description field.
  if Length(saveActors) = 0 then
  begin
    saveActors := TextBefore(Value, tappo, '');
    Value := TextAfter(Value, tappo);
    SetField(fieldActors, saveActors);
  end;
  SetField(fieldDescription, Value);   // Save description to field Description

  // Picture
  Value := TextBetween(Page, '<meta itemprop="thumbnailUrl" content="', '" />');   // extract the picture URL from "Page"
  //  Value := TextBetween(Page, '<meta itemprop="thumbnailUrl" content="', '?');   // extract the picture URL from "Page"
  //  Value := TextBetween(Page, '<meta itemprop="thumbnailUrl" content="', '.jpg') + '.jpg';   // extract the picture URL from "Page"
  if Value <> '' then
  begin
    GetPicture(Value);
    Pict_width := GetPictureWidth;
    Pict_height := GetPictureHeight;
    Pict_dim := GetPictureSize;
    Descr_Img := IntToStr(Pict_width) + 'x' + IntToStr(Pict_height);
    Pict_dim := Pict_dim / 1000;                           // stored with a ' kB' suffix
    SetField(fieldSize, (FloatToStr(Pict_dim) + ' kB'));
    SetField(fieldResolution, Descr_Img);
  end;

  // Issue number
  Value := TextBetween(Page, '<p class="tag_1 vc_tag">', '</p>');
  startdelimiter := '<span  class="valore"';
  endDelimiter := '</span>';
  // The delimiters are put back around the extracted text so that
  // HTMLRemoveTags strips the complete span tag.
  Value := startdelimiter + TextBetween(Value, startdelimiter, enddelimiter) + enddelimiter;
  HTMLDecode(Value);       // Clean from HTML codes (if some exist)
  HTMLRemoveTags(Value);   // Clean from HTML tags (if some exist)
  Value := FullTrim(Value);
  SetField(fieldMedia, Value);   // Save issue number to field Media

  // Series
  Value := TextBetween(Page, '<span class="nome">Collana</span>', '</p>');
  HTMLDecode(Value);       // Clean from HTML codes (if some exist)
  HTMLRemoveTags(Value);   // Clean from HTML tags (if some exist)
  Value := FullTrim(Value);
  SetField(fieldMediaType, Value);   // Save series to field MediaType

  // Translated title
  Value := TextBetween(Page, '<h1 itemprop="headline name" class="titolo_articolo titolo">', '</h1>');   // extract title part from variable "Page"
  HTMLDecode(Value);       // Clean from HTML codes (if some exist)
  HTMLRemoveTags(Value);   // Clean title from HTML tags (if some exist)
  Value := FullTrim(Value);
  SetField(fieldTranslatedTitle, Value);   // Save title to field TranslatedTitle

  // Source: periodicity followed by release date
  SaveValue := '';
  StartDelimiter := '<span class="nome">Periodicità:</span>';
  EndDelimiter := '</span>';
  Value := StartDelimiter + TextBetween(Page, StartDelimiter, EndDelimiter) + EndDelimiter;
  Value := removespaces(Value, True);     // remove the spaces between > and <
  HTMLDecode(Value);                      // Clean from HTML codes (if some exist)
  HTMLRemoveTags(Value);                  // Clean from HTML tags (if some exist)
  if Length(Value) > 0 then
    SaveValue := Value;

  StartDelimiter := '<span class="nome">uscita</span>';
  EndDelimiter := '</span>';
  Value := StartDelimiter + TextBetween(Page, StartDelimiter, EndDelimiter) + EndDelimiter;
  Value := removespaces(Value, True);     // remove the spaces between > and <
  HTMLDecode(Value);                      // Clean from HTML codes (if some exist)
  HTMLRemoveTags(Value);                  // Clean from HTML tags (if some exist)
  if Length(Value) > 0 then
    Value := SaveValue + ' - ' + Value;
  Value := FullTrim(Value);
  SetField(fieldSource, Value);           // Save periodicity / release date to field Source
end; // *********************** End of procedure "AnalyzePageAlbo" *****************************************

//------------------------------------------------------------------------------
// process batch mode                        from sergiobonelli.it
//------------------------------------------------------------------------------
// Reads the URL stored in the record and analyses that page directly.
// When the URL field is empty the user is told to fill it in and nothing else
// happens.
procedure BonelliBatch;
begin
   comicURL := GetField(fieldURL);
   if comicURL = '' then
   begin
      ShowMessage('Indirizzo non esistente - indicare l"URL da elaborare');
      exit;
   end;
   AnalyzePageAlbo(comicURL);
end;

//------------------------------------------------------------------------------
// process normal mode                          from sergiobonelli.it
//------------------------------------------------------------------------------
// Chooses the text to search for and runs the search once.
// The search text is the translated title; if that is empty the original
// title; if that is empty too, media type and media joined by a space.
// The record's file path is kept in file_name, which GetList shows in the
// header of the result list.
procedure BonelliNorm;
begin
   film_o_serieok := False;
   file_name := GetField(fieldFilePath);

   film_o_serieName := GetField(fieldTranslatedTitle);
   if film_o_serieName = '' then
      film_o_serieName := GetField(fieldOriginalTitle);
   if film_o_serieName = '' then
      film_o_serieName := GetField(fieldMediaType) + ' ' + GetField(fieldMedia);

   // The search runs exactly once. The former "repeat GetList until
   // film_o_serieok" repeated the identical query with nothing changed in
   // between, so a search without results or a cancelled selection could
   // never leave the loop.
   GetList;
end;
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// list of candidate albums                     from sergiobonelli.it
//------------------------------------------------------------------------------
// Searches Google, restricted to shop.sergiobonelli.it, for film_o_serieName,
// lists every result that points to the Bonelli shop in the pick tree and
// analyses the page the user selects.
// When the Primo_OK option is set (fortunato = 1) the work is handed to
// GetFirst instead.
// On every path that ends without analysing a page, film_o_serieok is set to
// True so that BonelliNorm's repeat..until loop does not run the same query
// again forever.
procedure GetList;
var
   Address, Title_album: String;
   dati_album, init_control, end_control: String;
   Desc_ricerca: String;
   found: boolean;
begin
   // GetFirst performs its own search, so delegate before downloading
   // anything here (the page used to be fetched twice).
   if fortunato = 1 then
   begin
      GetFirst;
      exit;
   end;

   PickTreeClear;                                                    // clear list
   Desc_ricerca := file_name;
   if Desc_ricerca = '' then
      Desc_ricerca := film_o_serieName;

   // Only the search text is URL-encoded; the fixed part of the address must
   // keep its ':', '/', '?', '&' and '=' characters as they are.
   Address := 'https://www.google.it/search?as_q=' + UrlEncode(film_o_serieName);
   Address := Address + '&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all';
   Address := Address + '&as_sitesearch=shop.sergiobonelli.it&as_occt=any&safe=images&as_filetype=&as_rights=';
   Page := GetPage(Address);
   SetField(fieldURL, Address);
   HTMLDecode(Page);
   if debug_search then
   begin
      DumpPage(folder+'googleListPage.html', Page);                // debug
      DumpPage(folder+'discogsList.txt', Page);                    // debug
   end;

   // header line of the pick tree
   PickTreeAdd('Lista articoli trovati per: "' + Desc_ricerca + '"', '"');

   if Pos('find anything', Page) <> 0 then
   begin
      LogMessage('Error while reading selection page - no results found for ' + Desc_ricerca);
      film_o_serieok := True;     // a retry of the same query cannot help
      exit;
   end;

   // Result list followed by the end-of-search sentinel               //2015-11-25
   pag_ricerca := TextBetween(Page, '<ol>', '</ol>') + tappo_fine_ricerca + '</ol>';
   if debug_search then
      DumpPage(folder+'googlericerca.html', pag_ricerca);          // debug

   // Every result sits between two '<div class="g">' markers.        //2016-02-10
   // dati_album holds the current result; the consumed text is removed from
   // the front of pag_ricerca, keeping the closing marker because it is the
   // opening marker of the next result.
   init_control := '<div class="g">';
   end_control := '<div class="g">';
   dati_album := init_control + TextBetween(pag_ricerca, init_control, end_control) + end_control;
   if debug_search then
      DumpPage(folder+'google_dati_album.html', dati_album);       // debug
   Delete(pag_ricerca, 1, Length(dati_album) - Length(end_control));
   if debug_search then
      DumpPage(folder+'googlericerca-1.html', pag_ricerca);        // debug

   // Starts False so that the "nothing found" message below can be reached;
   // it used to be preset to True and never cleared.
   found := False;
   while dati_album <> tappo_fine_ricerca do                 // read results up to the sentinel
   begin
      // Target of the link: text between the scheme and '&sa'          //2017-06-17
      // (the older '/%252' extraction was always overwritten and is gone)
      Address := TextBetween(dati_album, 'https://', '&sa');           //2017-01-04
      if Length(Address) < 1 then
         Address := TextBetween(dati_album, 'http://', '&sa');

      Title_album := TextBetween(dati_album, '<h3 class="r">', '</h3>');   // link description
      HTMLRemoveTags(Title_album);

      Address := StringReplace(Address, 'en.shop.', 'en.KO');        // makes the English shop fail the test below
      if (Pos('shop.sergiobonelli', Address) > 0)
        or (Pos('/scheda/', Address) > 0) then
      begin
         Address := 'http://' + Address;
         Title_album := Title_album + '  (' + Copy(Address, 1, 28) + ')';
         PickTreeAdd(Title_album, Address);
         found := True;
      end;

      // advance to the next result
      dati_album := init_control + TextBetween(pag_ricerca, init_control, end_control) + end_control;
      Delete(pag_ricerca, 1, Length(dati_album) - Length(end_control));
   end;

   if not found then
   begin
      ShowMessage('Nessun fumetto trovato per ' + Desc_ricerca);
      film_o_serieok := True;     // a retry of the same query cannot help
      exit;
   end;

   if PickTreeExec(Address) then
      AnalyzePageAlbo(Address)                                     // album page
   else
   begin
      ShowMessage('No serie/film selected');
      film_o_serieok := True;     // the user cancelled: do not ask again
   end;
end;

//------------------------------------------------------------------------------
// Take the first address                        from sergiobonelli.it
//------------------------------------------------------------------------------
// Searches Google, restricted to shop.sergiobonelli.it, for film_o_serieName
// and analyses the first result without asking the user.
// The result is used only if it points to the Bonelli shop; otherwise a
// message is shown and no page is analysed.
// On every path that ends without analysing a page, film_o_serieok is set to
// True so that BonelliNorm's repeat..until loop does not run the same query
// again forever.
procedure GetFirst;
var
   Address: String;
   dati_album, init_control, end_control: String;
   Desc_ricerca: String;
begin
   // No picker is shown here, so the tree is only cleared, not filled.
   PickTreeClear;
   Desc_ricerca := file_name;
   if Desc_ricerca = '' then
      Desc_ricerca := film_o_serieName;

   // Only the search text is URL-encoded; the fixed part of the address must
   // keep its ':', '/', '?', '&' and '=' characters as they are.
   Address := 'https://www.google.it/search?as_q=' + UrlEncode(film_o_serieName);
   Address := Address + '&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all';
   Address := Address + '&as_sitesearch=shop.sergiobonelli.it&as_occt=any&safe=images&as_filetype=&as_rights=';
   Page := GetPage(Address);
   SetField(fieldURL, Address);
   HTMLDecode(Page);
   if debug_search then
   begin
      DumpPage(folder+'googleListPage.html', Page);                // debug
      DumpPage(folder+'discogsList.txt', Page);                    // debug
   end;

   if Pos('find anything', Page) <> 0 then
   begin
      LogMessage('Error while reading selection page - no results found for ' + Desc_ricerca);
      film_o_serieok := True;     // a retry of the same query cannot help
      exit;
   end;

   // Result list followed by the end-of-search sentinel               //2015-11-25
   pag_ricerca := TextBetween(Page, '<ol>', '</ol>') + tappo_fine_ricerca + '</ol>';
   if debug_search then
      DumpPage(folder+'googlericerca.html', pag_ricerca);          // debug

   // First result: the text between the first two '<div class="g">' markers  //2016-02-10
   init_control := '<div class="g">';
   end_control := '<div class="g">';
   dati_album := init_control + TextBetween(pag_ricerca, init_control, end_control) + end_control;
   if debug_search then
      DumpPage(folder+'google_dati_album.html', dati_album);       // debug
   Delete(pag_ricerca, 1, Length(dati_album) - Length(end_control));
   if debug_search then
      DumpPage(folder+'googlericerca-1.html', pag_ricerca);        // debug

   // Target of the link: text between the scheme and '&sa'             //2017-06-17
   // (the older '/%252' extraction was always overwritten and is gone)
   Address := TextBetween(dati_album, 'https://', '&sa');              //2017-01-04
   if Length(Address) < 1 then
      Address := TextBetween(dati_album, 'http://', '&sa');

   Address := StringReplace(Address, 'en.shop.', 'en.KO');        // makes the English shop fail the test below

   // Only a Bonelli shop page may be analysed. The "found" flag used to be
   // preset to True, so any address - even an empty one - was passed on.
   if (Pos('shop.sergiobonelli', Address) = 0)
     and (Pos('/scheda/', Address) = 0) then
   begin
      ShowMessage('Nessun fumetto trovato per ' + Desc_ricerca);
      film_o_serieok := True;     // a retry of the same query cannot help
      exit;
   end;

   Address := 'http://' + Address;
   AnalyzePageAlbo(Address);                                    // album page
end;

// ***** Beginning of the script *****
// Checks the Ant Movie Catalog version, reads the two script options once and
// starts the batch (Mode = 1) or the normal (Mode = 0) procedure.
begin
      if CheckVersion(4,2,1) then
      begin
         // user's options (each is used more than once)
         Fortunato := GetOption('Primo_OK');
         BatchMode := GetOption('Mode');
         if BatchMode = 1 then
            BonelliBatch
         else if BatchMode = 0 then
            BonelliNorm;
      end
      else
         ShowMessage('This script requires a newer version of Ant Movie Catalog (at least the version 4.2.1)');
end.
Post Reply