[ antp.be > Forum ]

[ FAQ - Search - Memberlist - Usergroups ]

[ Register - Profile - Log in to check your private messages - Log in ]


[UPD] sergiobonellieditore.it fumetti

 
Post new topic   Reply to topic    www.antp.be Forum Index -> Ant Movie Catalog > Mods
Author Message
fulvio53s03
Posted: 2017-07-28 08:59:02    Post subject: [UPD] sergiobonellieditore.it fumetti Reply with quote
View user's profile Send private message Nuova Release dello script di estrazione informazioni collane di fumetti Bonelli.
Il sito è stato completamente ristrutturato e l'estrazione dati si è rivelata piuttosto complessa, usando Google come motore di ricerca della pagina del fumetto.
Può funzionare in due modi:
- Batch indicando l'url di estrazione dei dati
- Norm trova con Google la pagina di estrazione dati in base al contenuto del campo 'titolo tradotto' (consiglio di compilare il campo con titolo albo e nome collana o personaggio).
In modalità Norm si può scegliere di automatizzare l'estrazione dati dalla prima segnalazione Google: di solito è quella giusta ma ci sono alcune eccezioni.
Segnalazioni e suggerimenti sono, come sempre, bene accetti.
wink
Code:
(***************************************************

Ant Movie Catalog importation script
www.antp.be/software/moviecatalog/

[Infos]
Authors=Fulvio53s03
Title=sergiobonellieditore.it.ifs
Description=estrae dati albi Sergio Bonelli Editore
Site=
Language=IT
Version=2.0
Requires=4.2.1
Comments=puo' essere usato per ricerca informazioni tramite google (mode=0) oppure per estrarle direttamente da SergioBonelli.it (mode=1) inserendo l'URL direttamente.
License=*  The source code of the script can be used in   |*  another program only if full credits to Fulvio53s03*
GetInfo=1
RequiresMovies=1

[Options]
Mode=1|0|0=normal mode|1=batch mode (url)
Primo_OK=1|0|0=normal mode|1=primo link (sono fortunato)

[Parameters]

***************************************************)

program sergiobonellieditore;
uses
  StringUtils7552;   // Script needs external unit StringUtils7552.pas in scripts folder !
const
  debug_search = false;                                   // debug mode on/off su ricerca dischi
  debug_film_o_serie = false;                             // debug mode on/off su estrazione dati film_o_serie
  folder = 'f:\prova\';                                   // directory where to save files
  tappo = CRLF + 'xyzxyz';
  tappo_fine_ricerca = '<div class="g">Tappodifinericercadaticongoogle</div><div class="g">';    //2016-02-10

var
  ComicURL, ComicSeries, ComicNumber, MediaType, Collana: string;   // Define some script variables
  Page, SavePage, pag_ricerca, Save_pag, Value, saveValue : string;
  InitChar, EndChar, CharAbNormal, CharNormal : String;
  BatchMode, Fortunato:   integer;
  sw_serie, StartDelimiter, endDelimiter, saveActors : string;
  NumFumetto, numCollana : integer;
  LgthPage, lungo : integer;
  CharCut, StartPos, EndPos: integer;

  film_o_serieok: boolean;
  file_name, film_o_serieName: string;

// Normalizes the letter case of a few markers in the global "Page" so the
// extraction code only has to look for one spelling, then keeps a copy of the
// result in the global "SavePage".
Procedure NormalizePage;
begin
  // Label spelling used by the site for the story/script credits
  Page := StringReplace(Page, 'Soggetto e Sceneggiatura:', 'Soggetto e sceneggiatura:');

  // Upper-case bold tags -> lower-case
  Page := StringReplace(Page, '<B', '<b');
  Page := StringReplace(Page, '</B', '</b');

  // Upper-case div tags -> lower-case
  Page := StringReplace(Page, '<DIV ', '<div ');
  Page := StringReplace(Page, '</DIV>', '</div>');

  // The shared scratch globals are left holding the last search/replace pair
  CharAbNormal := '</DIV>';
  CharNormal := '</div>';

  SavePage := Page;
end;

// ***** Analyze Item's Page *****
// Downloads the album page at "URL" and copies the data found there into the
// current record: date, URL, authors (fieldActors), plot (fieldDescription),
// cover picture with its size/resolution, issue number (fieldMedia), series
// (fieldMediaType), title (fieldTranslatedTitle) and periodicity/release date
// (fieldSource).
// Sets the global "film_o_serieok" to True, which is what ends the
// repeat..until loop in BonelliNorm.
procedure AnalyzePageAlbo(URL: String);   // Variable "URL" is handed over (former variable "ComicURL")
var
  Descr_Img: string;
  Pict_dim: double;
  Pict_width, Pict_height: Integer;
begin
  film_o_serieok := True;
  Page := GetPage(URL);   // Fetch source code from website and store inside "Page"
  NormalizePage;
  Page:= UTF8Decode(Page);
  if debug_film_o_serie  then
     DumpPage(folder+'BonelliPageDetail.txt', Page);                         // debug dump of the whole page
  // Update date and page address
  SetField(fieldDate, DateToStr(Date));
  SetField(fieldURL, URL);

// fieldActors / authors
  Value := '';
  startdelimiter := 'class="vc_eco_product_body">';
  endDelimiter := '<link rel="stylesheet" href="shortcodeTemplates/css/articlePreview.css">';
  Value := TextBetween(Page, startdelimiter, enddelimiter);             // extract part from variable "Page"

  Value := removespaces(Value,True);                                    // remove the spaces between > and <
  // Put every '<span class="nome">' label on its own line
  Value := StringReplace(Value, '<span class="nome">', (CRLF + '<span class="nome">'));
  HTMLDecode(Value);   // Clean HTML codes (if some exist)
  HTMLRemoveTags(Value);
  // Drop the CRLF inserted in front of the first label. Strings are 1-based,
  // so the previous Delete(Value, 0, 1) removed nothing.
  if Pos(CRLF, Value) = 1 then
     Delete(Value, 1, Length(CRLF));
  Value := FullTrim (Value);            // Clean up
  Value := Stringreplace(Value, '  ', ' ');
  Value := Stringreplace(Value, '  ', ' ');
  Value := Stringreplace(Value, (' ' + CRLF), CRLF);
  saveActors := Value;
  SetField(fieldActors, Value);         // Save to field Actors

// plot (and authors, when the block above found none)
//2017-07-13  Value := TextBetween(page, '<script type="application/ld+json">', ' </script>');   // extract description part from variable "Page"
//2017-07-13  Value := TextBetween(Value, '"text":"', '","keywords"');
  Value := TextBetween(page, '<div itemprop="text" class="testo_articolo testo testoResize">', ' </div>');   //2017-07-13

  if debug_film_o_serie  then
     DumpPage(folder+'Bonellitrama.txt', value);
  HTMLDecode(Value);   // Clean description from HTML codes (if some exist)
  // Mark paragraph breaks with the "tappo" marker (CRLF + 'xyzxyz')
  Value := StringReplace(Value, '<br /><br />', ('<br />' + tappo));
  if pos(tappo, Value) = 0 then                                            //2017.07.26
      Value := StringReplace(Value, ('</p>' + CRLF + '<p>'), ('<br />' + tappo));        //2017.07.26
  // '</b>' is parked as 'qwer' and turned into a line break further down
  Value := StringReplace(Value, '</b>', 'qwer');                           //2017-07-13
  Value := Stringreplace(Value, '</strong>', CRLF);
  Value := Stringreplace(Value, '> ', '>');
  Value := Stringreplace(Value, '>' + CRLF, '>');                          //2017.07.21
  Value := Stringreplace(Value, 'qwer', CRLF);
  Value := Stringreplace(Value, (' ' + CRLF), CRLF);
  Value := Stringreplace(Value, (CRLF + CRLF), CRLF);
  Value := Stringreplace(Value, (CRLF + CRLF), CRLF);

  HTMLRemoveTags(Value);
  Value := FullTrim(Value);   // Clean up the description
  // NOTE(review): the marker was inserted right after '<br />', so the
  // '>' + CRLF replacement above looks like it strips the CRLF that "tappo"
  // starts with; confirm the marker is still found here, and that no
  // 'xyzxyz' text is left in the description when saveActors is not empty.
  if length(saveactors) = 0 then
     begin
        // No authors block: text before the first marker is used as authors
        saveactors := textbefore(value, tappo, '');
        value := textafter(value, tappo);
        Setfield (fieldActors, Saveactors);
     end;
  SetField(fieldDescription, Value);   // Save description to field Description

// Picture
  Value := '';   // Make sure "Value" is empty
  Value := TextBetween(Page, '<meta itemprop="thumbnailUrl" content="', '" />');   // extract the picture URL from "Page"
//  Value := TextBetween(Page, '<meta itemprop="thumbnailUrl" content="', '?');   // extract the picture URL from "Page"
//  Value := TextBetween(Page, '<meta itemprop="thumbnailUrl" content="', '.jpg') + '.jpg';   // extract the picture URL from "Page"
  if Value <> '' then
     begin
     GetPicture(Value);
     Pict_width    := GetPictureWidth;
     Pict_height    := GetPictureHeight;
     Pict_dim    := GetPictureSize;
     Descr_Img := IntToStr(Pict_width) + 'x' + IntToStr(Pict_height);   // "<width>x<height>"
     Pict_dim := Pict_dim / 1000;                                       // value stored with a ' kB' suffix
     setfield(fieldsize, (FloatToStr(Pict_dim) + ' kB'));
     setfield(fieldresolution, Descr_Img)
  end;

// issue number
  Value := '';   // Make sure "Value" is empty
  Value := TextBetween(Page, '<p class="tag_1 vc_tag">', '</p>');
  // The start delimiter has no closing '>' on purpose: re-attaching both
  // delimiters rebuilds a complete tag that HTMLRemoveTags can strip.
  startdelimiter := '<span  class="valore"';
  endDelimiter := '</span>';
  Value := startdelimiter + TextBetween(Value, startdelimiter, enddelimiter) + enddelimiter;
  HTMLDecode(Value);   // Clean from HTML codes (if some exist)
  HTMLRemoveTags(Value);   // Clean from HTML tags (if some exist)
  Value := FullTrim (Value);
  SetField(fieldMedia, Value);   // Save issue number to field Media

// series
  Value := TextBetween(Page, '<span class="nome">Collana</span>', '</p>');
  HTMLDecode(Value);   // Clean from HTML codes (if some exist)
  HTMLRemoveTags(Value);   // Clean from HTML tags (if some exist)
  Value := FullTrim (Value);
  SetField(fieldMediaType, Value);   // Save series name to field MediaType

// translated title
  Value := '';
  Value := TextBetween(Page, '<h1 itemprop="headline name" class="titolo_articolo titolo">', '</h1>');   // extract title part from variable "Page"
  HTMLDecode(Value);   // Clean from HTML codes (if some exist)
  HTMLRemoveTags(Value);   // Clean title from HTML tags (if some exist)
  Value := FullTrim (Value);
  SetField(fieldTranslatedTitle, Value);   // Save title to field TranslatedTitle

// source: periodicity followed by release date
  Savevalue := '';
  StartDelimiter := '<span class="nome">Periodicit&agrave;:</span>';
  EndDelimiter := '</span>';
  // The delimiters are re-attached, so the label text itself stays in Value
  Value := Startdelimiter + TextBetween(Page, StartDelimiter, EndDelimiter) + enddelimiter;
  Value := removespaces(Value,True);                                    // remove the spaces between > and <
  HTMLDecode(Value);                      // Clean from HTML codes (if some exist)
  HTMLRemoveTags(Value);                  // Clean from HTML tags (if some exist)
  if length(Value) > 0 then
      SaveValue := Value;
//
  StartDelimiter := '<span class="nome">uscita</span>';
  EndDelimiter := '</span>';
  Value := Startdelimiter + TextBetween(Page, StartDelimiter, EndDelimiter) + enddelimiter;
  Value := removespaces(Value,True);      // remove the spaces between > and <
  HTMLDecode(Value);                      // Clean from HTML codes (if some exist)
  HTMLRemoveTags(Value);                  // Clean from HTML tags (if some exist)
  if length(Value) > 0 then
      Value := Savevalue + ' - ' + Value;
  Value := FullTrim(Value);
  SetField(fieldSource, Value);           // Save periodicity and release date to field Source
end; // *********************** End of procedure "AnalyzePageAlbo" *****************************************

//------------------------------------------------------------------------------
// process batch mode                        da sergiobonelli.it
//------------------------------------------------------------------------------
// Batch mode: the record's URL field already holds the album page address,
// so it is analyzed directly. An empty URL field only produces a message.
procedure BonelliBatch;
begin
   comicURL := GetField(fieldURL);
   if comicURL = '' then
   begin
      ShowMessage('Indirizzo non esistente - indicare l"URL da elaborare');
      exit;
   end;
   AnalyzePageAlbo(comicURL);
end;

//------------------------------------------------------------------------------
// process normal mode                          da sergiobonelli.it
//------------------------------------------------------------------------------
procedure GetList; forward;   // defined further down; declared here because BonelliNorm calls it

// Normal mode: builds the search text for the current record and runs one
// Google lookup through GetList.
// The search text is the translated title; if that is empty the original
// title; if that is empty too, media type + ' ' + media.
procedure BonelliNorm;
begin
   film_o_serieok := False;
   file_name := GetField(fieldFilePath);

   film_o_serieName := GetField(fieldTranslatedTitle);
   if film_o_serieName = '' then
      film_o_serieName := GetField(fieldOriginalTitle);
   if film_o_serieName = '' then
      film_o_serieName := GetField(fieldMediaType) + ' ' + GetField(fieldMedia);

   // GetList is called once. The former "repeat GetList until film_o_serieok"
   // re-sent the very same query and could never finish when Google returned
   // no result or when the user closed the list without choosing an entry.
   GetList;
end;
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// list of film_o_series                        da sergiobonelli.it
//------------------------------------------------------------------------------
procedure GetFirst; forward;   // defined further down; declared here because GetList calls it

// Searches Google (restricted to shop.sergiobonelli.it) for the global
// "film_o_serieName", lists the matching result links in the pick tree and
// analyzes the page chosen by the user.
// When the option "Primo_OK" is on (global "fortunato" = 1) the work is
// handed to GetFirst instead, which takes the first result without asking.
// On every path that gives up (no results, nothing usable, list cancelled)
// "film_o_serieok" is set so that a caller looping on it stops retrying.
procedure GetList;
var
   Address, Title_album: String;
   dati_album, init_control, end_control, init_ol, end_ol: String;
   Desc_ricerca: String;
   found: boolean;
   lgth_dati_album: integer;
begin
   // GetFirst performs its own download, so dispatch before fetching anything
   // here; previously the same Google page was requested twice in this mode.
   if fortunato = 1 then
   begin
      GetFirst;
      exit;
   end;

   PickTreeClear;                                                    // clear list
   Desc_ricerca := file_name;
   if Desc_ricerca = '' then
      Desc_ricerca := film_o_serieName;
   Address := 'https://www.google.it/search?as_q=' + film_o_serieName;
   Address := Address + '&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all';
   Address := Address + '&as_sitesearch=shop.sergiobonelli.it&as_occt=any&safe=images&as_filetype=&as_rights=';
   Address := UrlEncode(Address);
   Page := GetPage(Address);
   SetField(fieldURL, Address);
   HTMLdecode(Page);
   if debug_search then
      DumpPage(folder+'googleListPage.html', Page);                // debug

   // Nothing matched yet; set to True by the loop below on the first usable link
   found := False;
   Desc_ricerca := 'Lista articoli trovati per: "' + Desc_ricerca + '"';
   PickTreeAdd(Desc_ricerca, '"');                                   // header line of the list
   if debug_search then
      DumpPage(folder+'discogsList.txt', Page);                // debug
   if Pos('find anything', Page) <> 0 then
   begin
      LogMessage('Error while reading selection page - no results found for ' + Desc_ricerca);
      film_o_serieok := True;      // nothing to retry: the same query would be sent again
      exit;
   end;

   // Result list, followed by a sentinel entry that marks the end of the data
   init_ol := '<ol>';                        //2015-11-25
   end_ol  := '</ol>';
   pag_ricerca := TextBetween(Page, init_ol, end_ol) + tappo_fine_ricerca + end_ol;
   if debug_search then
      DumpPage(folder+'googlericerca.html', pag_ricerca);                // debug

   save_pag := pag_ricerca;
   init_control := '<div class="g">';     //2016-02-10
   end_control := '<div class="g">';      //2016-02-10
   // dati_album holds one result entry at a time, wrapped in both delimiters
   dati_album := init_control + TextBetween (pag_ricerca, init_control, end_control) + end_control;
   lgth_dati_album := length(dati_album);
   if debug_search then
      DumpPage(folder+'google_dati_album.html', dati_album);                // debug
   // Consume the entry but keep the delimiter that opens the next one
   delete(Pag_ricerca, 1, (lgth_dati_album - length(end_control)));
   if debug_search then
      DumpPage(folder+'googlericerca-1.html', Pag_ricerca);                // debug

   while (dati_album <> tappo_fine_ricerca) do                 // read entries up to the sentinel
   begin
      // Link target: text between the scheme and '&sa'; https first, then http
      endchar  := '&sa';           //2017-06-17
      initchar := 'https://';      //2017-01-04
      Address  := textBetween(dati_album, initchar, endchar);
      if length(Address) < 1 then
      begin
         initchar := 'http://';    //2017-01-04
         Address  := textBetween(dati_album, initchar, endchar);
      end;

      initchar := '<h3 class="r">';
      endchar  := '</h3>';
      Title_album := textBetween(dati_album, initchar, endchar);              // link description
      HTMLRemoveTags(Title_album);
      Address := StringReplace(Address, 'en.shop.', 'en.KO');        // so that English shop links do not match below
      if (pos('shop.sergiobonelli', Address) > 0)
        or (pos('/scheda/', Address) > 0) then
      begin
         Address := 'http://' + Address;
         Title_album := Title_album + '  (' + copy(Address, 1, 28) + ')';
         PickTreeAdd(Title_album, Address);
         found := True;
      end;

      // Next entry
      dati_album := init_control + TextBetween (pag_ricerca, init_control, end_control) + end_control;
      lgth_dati_album := length(dati_album);
      delete(Pag_ricerca, 1, (lgth_dati_album - length(end_control)));
   end;

   if not found then
   begin
      ShowMessage('Nessun fumetto trovato per ' + Desc_ricerca);
      film_o_serieok := True;      // give up instead of repeating the same search
      exit;
   end;

//    example: shop.sergiobonelli.it/tex/2017/04/04/albo/gli-incappucciati-del-klan-1000852/
   if PickTreeExec(Address) then
      AnalyzePageAlbo(Address)                                     // Albo page
   else
   begin
      ShowMessage('No serie/film selected');
      film_o_serieok := True;      // the user cancelled: do not ask again
   end;
end;

//------------------------------------------------------------------------------
// Prendi il primo indirizzo                        da sergiobonelli.it
//------------------------------------------------------------------------------
// "First hit" mode: searches Google (restricted to shop.sergiobonelli.it) for
// the global "film_o_serieName" and analyzes the first result entry without
// asking the user.
// The entry is used only if its link contains 'shop.sergiobonelli' or
// '/scheda/'; otherwise a message is shown and nothing is analyzed.
// On every path that gives up, "film_o_serieok" is set so that a caller
// looping on it stops retrying.
procedure GetFirst;
var
   Address: String;
   dati_album, init_control, end_control, init_ol, end_ol: String;
   Desc_ricerca: String;
   lgth_dati_album: integer;
begin

   PickTreeClear;                                                    // clear list
   Desc_ricerca := file_name;
   if Desc_ricerca = '' then
      Desc_ricerca := film_o_serieName;
   Address := 'https://www.google.it/search?as_q=' + film_o_serieName;
   Address := Address + '&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all';
   Address := Address + '&as_sitesearch=shop.sergiobonelli.it&as_occt=any&safe=images&as_filetype=&as_rights=';
   Address := UrlEncode(Address);
   Page := GetPage(Address);
   SetField(fieldURL, Address);
   HTMLdecode(Page);
   if debug_search then
      DumpPage(folder+'googleListPage.html', Page);                // debug

   Desc_ricerca := 'Lista articoli trovati per: "' + Desc_ricerca + '"';
   PickTreeAdd(Desc_ricerca, '"');
   if debug_search then
      DumpPage(folder+'discogsList.txt', Page);                // debug
   if Pos('find anything', Page) <> 0 then
   begin
      LogMessage('Error while reading selection page - no results found for ' + Desc_ricerca);
      film_o_serieok := True;      // nothing to retry: the same query would be sent again
      exit;
   end;

   // Result list, followed by a sentinel entry that marks the end of the data
   init_ol := '<ol>';                        //2015-11-25
   end_ol  := '</ol>';
   pag_ricerca := TextBetween(Page, init_ol, end_ol) + tappo_fine_ricerca + end_ol;
   if debug_search then
      DumpPage(folder+'googlericerca.html', pag_ricerca);                // debug

   save_pag := pag_ricerca;
   init_control := '<div class="g">';     //2016-02-10
   end_control := '<div class="g">';      //2016-02-10
   // First result entry, wrapped in both delimiters
   dati_album := init_control + TextBetween (pag_ricerca, init_control, end_control) + end_control;
   lgth_dati_album := length(dati_album);
   if debug_search then
      DumpPage(folder+'google_dati_album.html', dati_album);                // debug
   delete(Pag_ricerca, 1, (lgth_dati_album - length(end_control)));
   if debug_search then
      DumpPage(folder+'googlericerca-1.html', Pag_ricerca);                // debug

   // Link target: text between the scheme and '&sa'; https first, then http
   endchar  := '&sa';           //2017-06-17
   initchar := 'https://';      //2017-01-04
   Address  := textBetween(dati_album, initchar, endchar);
   if length(Address) < 1 then
   begin
      initchar := 'http://';    //2017-01-04
      Address  := textBetween(dati_album, initchar, endchar);
   end;

   Address := StringReplace(Address, 'en.shop.', 'en.KO');        // so that English shop links do not match below

   // The link test alone decides whether there is a hit. Formerly a flag
   // preset to True let a non-matching (or empty) address through to
   // AnalyzePageAlbo without even the 'http://' prefix.
   if (pos('shop.sergiobonelli', Address) = 0)
     and (pos('/scheda/', Address) = 0) then
   begin
      ShowMessage('Nessun fumetto trovato per ' + Desc_ricerca);
      film_o_serieok := True;      // give up instead of repeating the same search
      exit;
   end;

   Address := 'http://' + Address;
   AnalyzePageAlbo(Address);                                    // Albo page
end;

// ***** Beginning of the script *****
begin
      // Script entry point: check the host version, read the two options once
      // and run the selected mode (Mode = 1: batch by URL, Mode = 0: search).
      if CheckVersion(4,2,1) then
      begin
         Fortunato := GetOption('Primo_OK');
         BatchMode := GetOption('Mode');
         if BatchMode = 1 then
            BonelliBatch
         else if BatchMode = 0 then
            BonelliNorm;
      end
      else
         ShowMessage('This script requires a newer version of Ant Movie Catalog (at least the version 4.2.1)');
end.
Display posts from previous:   
View previous topic :: View next topic  
Post new topic   Reply to topic    www.antp.be Forum Index -> Ant Movie Catalog > Mods All times are GMT + 1 Hour
Page 1 of 1

 
Jump to:  
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot vote in polls in this forum


Powered by phpBB © 2001, 2006 phpBB Group
Template made by antp