[UPD ITA] Wikipedia.it
Posted: 2017-03-11 09:16:54
Code: Select all
(***************************************************
Ant Movie Catalog importation script
www.antp.be/software/moviecatalog/
[Infos]
Authors=Fulvio53s03
Title=wikipedia.it
Description=import film e serie TV
Site=http://www.wikipedia.it
Language=IT
Version=0.2
Requires=4.2.1
Comments=puo' essere usato per ricerca informazioni tramite google (mode=0) oppure per estrarle direttamente da wikipedia.it (mode=1) inserendo l'URL direttamente. |Inserire il termine (Stagione....) nel titolo tradotto rende migliore la ricerca della cover in bing.com per le serie TV
License=* The source code of the script can be used in |* another program only if full credits to Fulvio53s03*
GetInfo=1
RequiresMovies=1
[Options]
Mode=0|0|0=normal mode|1=batch mode (url)
Origine_foto=1|0|0=img da wiki|1=img da bing
[Parameters]
***************************************************)
// TypeSearch=1|1|1=search for exact expression|2=search for any word
// TypeOrder=3|3|1=order by Artist|2=order by film_o_serie Title|3=order by relevance
// needs the following units
// StringUtils7552.pas
//
// Script-level declarations: required unit, constants and the global working
// variables shared by all procedures of this script.
program wiki;
uses
StringUtils7552;
const
UrlBase = 'https://it.wikipedia.org'; // base url
bing_Base = 'https://www.bing.com/images/search?q='; // prefix of the Bing image search address
debug_search = false; // debug mode on/off for the search step (guards the DumpPage calls there)
debug_film_o_serie = false; // debug mode on/off for the film/series data extraction
folder = 'f:\prova\'; // directory where to save files
CRLFspace = CRLF + ' ';
//2016-02-10 tappo_fine_ricerca = '<li class="g">Tappodifinericercadaticongoogle</li><li class="g">';
// Sentinel ("stopper") appended to the Google result list; GetList stops
// extracting results when it reads this value back.
tappo_fine_ricerca = '<div class="g">Tappodifinericercadaticongoogle</div><div class="g">'; //2016-02-10
MaxItems = 50;
var
nuovo_anno, mese_anno, solo_anno, data_rilascio, label, stagione: String;
Credit_address, Readdress, Page: String;
film_o_serieName, ArtistName, firstcall, name, autori: String;
Formato, Releases, artista: String;
file_name, campo_URL: String;
tipo_ricerca, pag_ricerca, save_value, ordine_lista, Salva_autore: String;
value, episodi, lista_episodi, riquadro: string;
CharNormal, CharAbNormal: string;
// NOTE(review): film_o_serie_ok is not referenced by the code visible in this
// file; the flag actually read and written is film_o_serieok (declared below).
film_o_serie_ok: boolean;
BatchMode, origine_foto_opt, giri, anno_int: Integer;
pos_chr, lgth_file_name: Integer;
titolo_e_autore, titolo, autore, durata, cover: String;
save_comments, commento_episodi, commento_skeda: String;
batchlog, confbatch: TstringList;
save_pag, artist, initchar, endchar: string;
// Set to True by estrazione_scheda; wikiNorm keeps prompting until it is True.
film_o_serieok: boolean;
bing_url, stagione_special: string;
//------------------------------------------------------------------------------
// list of film_o_series
//------------------------------------------------------------------------------
procedure GetList;
// Searches it.wikipedia.org through Google for film_o_serieName, adds one
// pick-tree entry per result (caption = result title, address = result link)
// and, when the user picks an entry, passes its address to AnalyzeMoviePage.
// Writes the script-level variables Page, pag_ricerca, save_pag, initchar and
// endchar, and stores the search address in the URL field.
var
  Address, Title_album: String;
  dati_album, init_control, end_control, init_ol, end_ol: String;
  Desc_ricerca: String;
  found: boolean;
  lgth_dati_album: integer;
begin
  PickTreeClear; // clear list
  // Text shown to the user to describe the search: file path if there is one,
  // otherwise the title that is being searched.
  Desc_ricerca := file_name;
  if Desc_ricerca = '' then
    Desc_ricerca := film_o_serieName;

  // Only the user-supplied search term is URL-encoded, so the fixed part of
  // the address (scheme, '?', '&', '=') reaches GetPage untouched.
  //fs2017.01.30 Address := 'https://www.google.it/search?as_q=' + film_o_serieName;
  Address := 'https://www.google.it/search?num=20&as_q=' + UrlEncode(film_o_serieName); //fs2017.01.30
  Address := Address + '&as_epq=&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr=&as_qdr=all';
  Address := Address + '&as_sitesearch=it.wikipedia.org&as_occt=any&safe=images&as_filetype=&as_rights=';
  Page := GetPage(Address);
  SetField(fieldURL, Address);
  HTMLdecode(Page);
  if debug_search then
    DumpPage(folder+'googleListPage.html', Page); // debug

  // Nothing has been added yet; set to True only when a result is listed.
  found := False;
  // Header line of the pick tree (empty address).
  PickTreeAdd('Lista articoli trovati per "' + Desc_ricerca + '"', '');
  if debug_search then
    DumpPage(folder+'wikiList.txt', Page); // debug
  if Pos('find anything', Page) <> 0 then
  begin
    LogMessage('Error while reading selection page - no results found for ' + Desc_ricerca);
    exit;
  end;

  // Result list plus the end-of-search sentinel.
  init_ol := '<ol>'; //2015-11-25
  end_ol := '</ol>';
  pag_ricerca := TextBetween(Page, init_ol, end_ol) + tappo_fine_ricerca + end_ol;
  if debug_search then
    DumpPage(folder+'wikiricerca.html', pag_ricerca); // debug
  save_pag := pag_ricerca;

  // Each result is the text between two consecutive '<div class="g">' markers.
  init_control := '<div class="g">'; //2016-02-10
  end_control := '<div class="g">'; //2016-02-10
  dati_album := init_control + TextBetween(pag_ricerca, init_control, end_control) + end_control;
  lgth_dati_album := length(dati_album);
  if debug_search then
    DumpPage(folder+'google_dati_album.txt', dati_album); // debug
  // Drop the consumed result but keep the closing marker, which is also the
  // opening marker of the next result.
  delete(pag_ricerca, 1, (lgth_dati_album - length(end_control)));
  if debug_search then
    DumpPage(folder+'wikiricerca-1.txt', pag_ricerca); // debug

  // Extract results until the sentinel is read back; the second condition
  // prevents an endless loop should the sentinel ever be missed.
  while (dati_album <> tappo_fine_ricerca) and (pag_ricerca <> '') do
  begin
    // Link address: look for an https link first, then for an http link; for
    // each scheme try the '/%252' end marker before the '&sa' one.
    initchar := 'https://'; endchar := '/%252';
    Address := textBetween(dati_album, initchar, endchar);
    if Address = '' then
    begin
      endchar := '&sa';
      Address := textBetween(dati_album, initchar, endchar);
    end;
    if Address = '' then
    begin
      initchar := 'http://'; endchar := '/%252';
      Address := textBetween(dati_album, initchar, endchar);
      if Address = '' then
      begin
        endchar := '&sa';
        Address := textBetween(dati_album, initchar, endchar);
      end;
    end;

    // List the result only when a link was really found.
    if Address <> '' then
    begin
      Address := initchar + Address;
      initchar := '<h3 class="r">';
      endchar := '</h3>';
      Title_album := textBetween(dati_album, initchar, endchar); // link caption
      HTMLRemoveTags(Title_album);
      PickTreeAdd(Title_album, Address);
      found := True;
    end;

    // Next result.
    dati_album := init_control + TextBetween(pag_ricerca, init_control, end_control) + end_control;
    lgth_dati_album := length(dati_album);
    delete(pag_ricerca, 1, (lgth_dati_album - length(end_control)));
  end;

  if not found then
  begin
    LogMessage('No album found for ' + Desc_ricerca);
    exit;
  end;
  if PickTreeExec(Address) then
    AnalyzeMoviePage(Address) // selected article page
  else
    LogMessage('No serie/film selected');
end;
//------------------------------------------------------------------------------
// ANALYZE Movie PAGE
//------------------------------------------------------------------------------
procedure AnalyzeMoviePage(Address: string);
// Clears the three script-level comment buffers and then lets
// estrazione_scheda read the page at Address.
begin
  commento_skeda := '';
  commento_episodi := '';
  save_comments := '';
  estrazione_scheda(Address);
  // Disabled: writing the collected comments back to the comments field.
  // save_comments := commento_skeda + CRLF + commento_episodi;
  // SetField(fieldcomments, save_comments); // comments + episode list
end;
//------------------------------------------------------------------------------
// ESTRAE INFORMAZIONI DA SCHEDA
//------------------------------------------------------------------------------
procedure estrazione_scheda(Address: string);
// Downloads the page at Address and copies what it finds into the fields of
// the current movie: URL, date, translated title, description, comments,
// country, picture, original title, category, director, actors, year,
// producer, writer and composer.
// Most values are read from the '<table class="sinottico"' box of the page.
// Working data is kept in script-level variables (Page, Value, riquadro,
// label, initchar, endchar, cover, pos_chr, lgth_file_name, bing_url) and
// film_o_serieok is set to True.
begin
  Page := GetPage(Address);
  SetField(fieldURL, Address);
  Page := UTF8decode(Page);
  HTMLdecode(Page);
  if debug_film_o_serie then
    DumpPage(folder+'wikiPageDetail.txt', Page); // debug_film_o_serie
  campo_URL := getfield(fieldurl);
  SetField(fieldDate, DateToStr(Date));
  // NOTE(review): Normalizza_page declares its parameter by value, so Page
  // is not modified by this call.
  Normalizza_Page(Page);
  if debug_film_o_serie then
    DumpPage(folder+'wikiPageDetailHTMLdecode.txt', Page); // debug_film_o_serie
  // Page := UTF8Decode(Page); //fs2015-01-15
  film_o_serieok := True;
  Value := Page;
  //fs 2017.01.28 riquadro := textbetween(Value, '<div id="mw-content-text"', '</div>');
  riquadro := textbetween(Value, '<table class="sinottico"', '</table>'); //fs 2017.01.28
  if debug_film_o_serie then
    DumpPage(folder+'wikiriquadro.txt', riquadro);

  //*** translated title (page title), only when the field is still empty
  initchar := '<title>'; //fs2016-06-26
  endchar := '</title>';
  label := textbetween(Value, initchar, endchar);
  label := stringreplace(label, ' - Wikipedia', '');
  if getfield (fieldtranslatedTitle) = '' then
    SetField(fieldtranslatedTitle, label);

  //*** description: first paragraph of the "Trama" (plot) section ...
  label := textbetween(value, '<span class="mw-headline" id="Trama">Trama</span>', '<h');
  label := textbetween(label, '<p>', '</p>');
  // label := textbetween(value, '<h2><span class="mw-headline" id="Trama">', '<h2>');
  // ... otherwise the "Descrizione" section ...
  if label = '' then //2017.03.11 FS
  begin
    label := textbetween(value, '<span class="mw-headline" id="Descrizione">Descrizione</span>', '<h');
    label := textafter(label, '</h2>');
  end;
  if debug_film_o_serie then
    DumpPage(folder+'wikiDescrizione.html', label); // debug_film_o_serie
  // ... otherwise whatever lies between the first '</table>' and the next '<h2>'.
  if label = '' then
  begin
    label := textafter(value, '</table>');
    label := textbefore(label, '<h2>', '');
  end;
  HTMLRemoveTags(label);
  label := fulltrim(label);
  SetField(fielddescription, label);

  //*** comments: general description, from the opening '<p><i><b>' up to the table of contents
  initchar := '<p><i><b>';
  label := initchar + textbetween(value, initchar, '<div id="toc" class="toc">');
  // label := textbetween(label, '<p>', '</p>');
  HTMLRemoveTags(label);
  SetField(fieldcomments, label);

  // *** data taken from the "sinottico" box ***
  //*** country
  initchar := '>Paese'; endchar := '</tr>';
  label := TextBetween(riquadro, initchar, endchar);
  HTMLRemoveTags(label);
  label := stringReplace(label, 'di produzione', '');
  label := stringReplace(label, 'Stati Uniti d''America', 'USA');
  label := fulltrim(label);
  SetField(fieldcountry, label);

  //*** cover from wikipedia (first src="..." of the box) ------------------
  cover := 'https:' + TextBetween(riquadro, 'src="', '"');
  if (origine_foto_opt = 0) and (cover <> 'https:') then
    GetPicture(cover);

  //*** cover from Bing -------------------
  // Look for a season/series marker in the searched name; the first pattern
  // that matches gives the position where the plain title ends.
  pos_chr := pos('(stagione-)', film_o_serieName);
  if pos_chr = 0 then
    pos_chr := pos('(stagione-', film_o_serieName);
  if pos_chr = 0 then
    pos_chr := pos('(stagione', film_o_serieName);
  if pos_chr = 0 then
    pos_chr := pos(' stagione ', film_o_serieName);
  if pos_chr = 0 then
    pos_chr := pos(' serie ', film_o_serieName);
  lgth_file_name := length(film_o_seriename);
  if pos_chr = 0 then
    bing_url := 'COVER ' + film_o_serieName
  else
  begin
    // Keep everything before the marker. Copy + Trim is used instead of
    // deleting from pos_chr - 1, which removed the last letter of the title
    // whenever the marker itself starts with a space.
    bing_url := Trim(copy(film_o_serieName, 1, pos_chr - 1));
    if bing_url = '' then
      bing_url := film_o_serieName; // marker at the very start: search the whole name
    bing_url := 'SERIE ' + bing_url;
  end;
  bing_url := bing_Base + UrlEncode(bing_url);
  bing_url := bing_url + '&qft=+filterui:imagesize-large+filterui:aspect-tall&FORM=R5IR28';
  if origine_foto_opt = 1 then
    AnalyzeBing(bing_url);
  //----------------- end of cover from Bing -------------------

  //*** original title
  label := textbetween(riquadro, '>Titolo originale<', '</td>');
  initchar := '<td'; endchar := '</td>';
  label := initchar + textbetween(label, initchar, '</i>');
  label := FormatText(label);
  SetField(fieldoriginalTitle, label);

  //*** genre
  initchar := '>Genere<';
  label := textbetween(riquadro, initchar, '</tr>');
  initchar := '<td'; endchar := '</td>';
  label := initchar + textbetween(label, initchar, endchar);
  HTMLRemoveTags(label);
  label := AnsiMixedCase(AnsiLowerCase(label), ' ');
  SetField(fieldcategory, label);

  //*** director
  initchar := '>Regia</a>'; endchar := '</td>';
  label := TextBetween(riquadro, initchar, endchar);
  HTMLRemoveTags(label);
  label := FormatText(label);
  SetField(fieldDirector, label);

  //*** characters and actors: from the first '<li>' of the box to the next '</ul>'
  initchar := '<li>';
  endchar := '</ul>';
  label := initchar + TextBetween(riquadro, initchar, endchar) + endchar;
  HTMLRemoveTags(label);
  SetField(fieldactors, label);

  //*** year
  initchar := '>Anno'; endchar := '</a>';
  label := TextBetween(riquadro, initchar, endchar);
  label := stringReplace(label, ' - in produzione', '');
  HTMLRemoveTags(label);
  // label := FormatText(label);
  SetField(fieldyear, label);

  //*** producer
  initchar := '>Casa di produzione<'; endchar:= '</tr>';
  label := textbetween(riquadro, initchar, endchar);
  // The start marker ends with '<', so that character is put back before the
  // tags are removed.
  label := '<' + stringReplace(label, '<br />', ', ');
  HTMLRemoveTags(label);
  SetField(fieldproducer, label);

  //*** writer: "Soggetto" (film), otherwise "Ideatore" (TV series creator)
  initchar := '>Soggetto'; // film
  label := textbetween(value, initchar, '</tr>');
  // label := textbetween(label, '<td>', '</td>');
  HTMLRemoveTags(label);
  SetField(fieldwriter, label);
  if label = '' then
  begin
    initchar := '<th>Ideatore</th>'; // series
    label := textbetween(value, initchar, '</tr>');
    // label := textbetween(label, '<td>', '</td>');
    HTMLRemoveTags(label);
    SetField(fieldwriter, label);
  end;

  //*** composer
  initchar := '>Musiche'; endchar:= '</tr>';
  label := textbetween(riquadro, initchar, endchar);
  label := '<' + stringReplace(label, '<br />', ', ');
  HTMLRemoveTags(label);
  SetField(fieldcomposer, label);
end;
Procedure Normalizza_page(Pagina: string);
// Normalises a page held in Pagina: applies the '<([^>]+)>' -> '<\L\1>'
// regular-expression replacement, replaces line breaks (crlf) with spaces and
// rewrites a fixed list of upper-case tag openings in lower case.
// Uses the script-level variables CharAbNormal / CharNormal as work fields.
//
// NOTE(review): Pagina is passed by value, so the caller's string is NOT
// changed by this procedure. Before turning it into a var parameter, check the
// regular-expression step: it presumably lower-cases everything inside a tag,
// attribute values included, which the extraction code matches case-sensitively.
begin
  pagina := RegExprSetReplace('<([^>]+)>', pagina, '<\L\1>', true);
  CharAbNormal := crlf; CharNormal := ' ';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<B'; CharNormal := '<b';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</B'; CharNormal := '</b';
  // Work on pagina here as well: reading the global Value at this point threw
  // away every replacement made above.
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<FONT'; CharNormal := '<font';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</FONT'; CharNormal := '</font';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<TR'; CharNormal := '<tr';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</TR'; CharNormal := '</tr';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<TD'; CharNormal := '<td';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</TD'; CharNormal := '</td';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<DIV'; CharNormal := '<div';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</DIV'; CharNormal := '</div';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '<Ol'; CharNormal := '<ol';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
  CharAbNormal := '</Ol'; CharNormal := '</ol';
  pagina := StringReplace(pagina, CharAbNormal, CharNormal);
end;
//------------------------------------------------------------------------------
// set show warning (normal mode) or add to log (batch mode)
//------------------------------------------------------------------------------
procedure LogMessage(m: string);
// Reports message m: as a warning on screen when BatchMode is not positive,
// otherwise through AddToLog, prefixed with the number of the current item.
begin
  if BatchMode <= 0 then
    ShowWarning(m)
  else
    AddToLog('item ' + GetField(fieldNumber) + ': ' + m);
end;
//------------------------------------------------------------------------------
// add a message in the batch log and save to disk
// (because I don't know when it's finished...)
//------------------------------------------------------------------------------
procedure AddToLog(m: string);
// Currently does nothing: both statements that wrote to the batch log are
// commented out, so messages routed here by LogMessage are dropped.
// NOTE(review): the disabled code refers to batchlogfic, which is not declared
// in the part of the script visible here - confirm before re-enabling.
begin
//fs2016-12-31 batchlog.Add(m);
//fs2016-12-31 batchlog.SaveToFile(batchlogfic);
end;
//------------------------------------------------------------------------------
// process batch mode
//------------------------------------------------------------------------------
procedure wikiBatch;
// Batch mode: takes the address stored in the URL field of the current item
// and analyses it when it is not empty and contains UrlBase; any other value
// is reported through LogMessage and skipped.
var
  url_valido: boolean;
begin
  film_o_serieName := GetField(fieldUrl);
  url_valido := (film_o_serieName <> '') and (Pos(UrlBase, film_o_serieName) > 0);
  if not url_valido then
    LogMessage('ignored url="'+film_o_serieName+'"')
  else
    AnalyzeMoviePage(film_o_serieName);
end;
//------------------------------------------------------------------------------
// process normal mode
//------------------------------------------------------------------------------
procedure wikiNorm;
// Normal (interactive) mode: proposes the translated title, or the original
// title when the translated one is empty, lets the user edit it and runs
// GetList. The prompt is repeated until film_o_serieok is True; cancelling the
// prompt or entering an empty name leaves the procedure.
begin
  file_name := getField(fieldfilepath);
  if (GetField(fieldTranslatedTitle) <> '') then
    film_o_serieName := GetField(fieldTranslatedTitle) // get film_o_serie name
  else
    film_o_serieName := GetField(fieldOriginalTitle); //mrobama
  repeat
    if (not Input('wiki = ' + ArtistName + ' - ' + film_o_serieName, 'Scrivi il nome del film o della serie:'
      + crlf, film_o_serieName)) or (film_o_serieName = '') then
      exit;
    formato := getfield(FieldOriginalTitle);
    // Pos takes the text to look for first: flag the entry as a series when
    // the original title contains 'serie' (the arguments were swapped before).
    if pos('serie', formato) > 0 then
      formato := 'serie';
    GetList;
  until film_o_serieok;
end;
//---------------------------- estrazione immagine da Bing ------------------------
// ---
function RemovePar(wholetext: string) : string;
// Returns wholetext trimmed. When there is non-blank text before the first '('
// and RemainingText contains a ')', the result is instead that leading text,
// a space, and the trimmed text that follows the ')' in RemainingText.
// NOTE(review): RemainingText is not assigned in this file; presumably it is
// filled by TextBefore in the StringUtils unit - confirm.
var
  testa: String;
begin
  testa := Trim(TextBefore(wholetext, '(', ''));
  if (testa <> '') and (Pos(')', RemainingText) > 0) then
    wholetext := testa + ' ' + Trim(TextAfter(RemainingText, ')')); // + end of text or ''
  result := Trim(wholetext);
end;
// ---
procedure AnalyzeBing(bing_Address: string);
// Downloads the Bing image-search page at bing_Address, takes the first
// '<a class="thumb"' ... '<div class="fileInfo">' fragment and loads the
// picture referenced by the first href="..." found in it.
// Does nothing when the page holds no such fragment or the fragment has no
// link. Writes the script-level variables initchar and endchar.
var
  PageText, strTest, strImg: string;
begin
  // Read the whole page
  PageText := GetPage(bing_Address);
  if debug_search then
    DumpPage(folder+'bingSearchPage.html', PageText); // debug

  // First thumbnail fragment. The emptiness test is made on the extracted
  // text, before the markers are put back around it.
  initchar := '<a class="thumb"'; endchar := '<div class="fileInfo">';
  strTest := TextBetween(PageText, initchar, endchar);
  if strTest = '' then
    exit; // no thumbnail on the page
  strTest := initchar + strTest + endchar;

  // Get the img reference
  initchar := 'href="'; endchar := '"';
  strImg := TextBetween(strTest, initchar, endchar);
  if strImg <> '' then
    GetPicture(strImg);
end;
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// start here
//------------------------------------------------------------------------------
begin
  // Entry point: check the host version, read the two options once, then run
  // the batch (Mode = 1) or the interactive (Mode = 0) procedure.
  if CheckVersion(4,2,1) then
  begin
    // get user's parms (used more than once)
    BatchMode := GetOption('Mode');
    origine_foto_opt := GetOption('Origine_foto');
    if BatchMode = 1 then
      wikiBatch
    else if BatchMode = 0 then
      wikiNorm;
  end
  else
    ShowMessage('This script requires a newer version of Ant Movie Catalog (at least the version 4.2.1)');
end.