PovCrawler.java

Go to the documentation of this file.
00001 package org.net2map.pov.documentsManager;
00002 
00003 //import websphinx.*;
00004 
00005 import java.beans.XMLEncoder;
00006 import java.beans.XMLDecoder;
00007 
00008 import java.io.*;
00009 
00010 import java.net.URL;
00011 import java.net.MalformedURLException;
00012 
00013 /*import net.matuschek.spider.WebRobot;
00014 import net.matuschek.http.URLLogger;
00015 import net.matuschek.http.HttpDocToFile;*/
00016 
00017 import websphinx.*;
00018 
00019 public class PovCrawler
00020 {
00021     static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger( PovCrawler.class.getName() );
00022     static PovUrlsSet usUrls, usNews;
00023     //PovUrlsSet htUrlsToCrawl;
00024     int nCurrentLevel = 0;
00025     PovIndexer pIndexer;
00026     String sMirrorDirectory = "";
00027 
00028     public PovCrawler( PovIndexer indexer, String mirrordirectory )
00029     {
00030         logger.setLevel( ( org.apache.log4j.Level )org.apache.log4j.Priority.WARN );
00031         pIndexer = indexer;
00032         sMirrorDirectory = mirrordirectory;
00033         usUrls = new PovUrlsSet();
00034     }
00035     
00036     public void crawl( PovUrl root, boolean create, boolean filecontinue )
00037     {
00038         boolean bCreate = create;
00039         boolean bInit = ( usUrls.size() == 0 );
00040         
00041         PovUrl url = root;
00042         
00043         //chargement de la liste du fichier
00044         if ( filecontinue )
00045         {
00046             if ( usUrls.size() == 0 )
00047             {
00048                 try
00049                 {
00050                     usUrls = (PovUrlsSet)load( "urls.dat" );
00051                     url = usUrls.nextUrl();
00052                 }
00053                 catch ( Exception e )
00054                 {
00055                     logger.warn( e.getMessage() );
00056                 }
00057             }
00058         }
00059         
00060         //if ( url.getUrl() != "" )
00061         if ( url != null )
00062         {
00063             logger.warn( "Launch crawl with url " + url.toString() );
00064             //création de la liste de nouvelles url
00065             sphinx( url );
00066             //logger.warn( usNews.size() + " nouvelles url à traiter");
00067             //logger.warn( usUrls.size() + " url courantes au total");
00068             //merge des nouvelles url dans la liste de base
00069             PovUrlsSet usMerged = usUrls.merge( usNews );
00070             
00071             if ( bInit )
00072             {
00073                 if ( usMerged.get( url.getUrl() ) == null )
00074                 {
00075                     usMerged.put( url.getUrl(),  url );
00076                 }
00077                 else
00078                 {
00079                     ((PovUrl)usMerged.get( url.getUrl() ) ).setDepth( url.getDepth() );
00080                 }
00081             }
00082             
00083             logger.warn( usMerged.size() + " url à indexer");
00084             logger.warn( usUrls.size() + " url courantes au total");
00085             //logger.warn( usUrls.toString() );
00086             //Indexation de la liste de nouvelles url
00087             try
00088             {
00089                 //pIndexer.indexDocs( usNews, bCreate );
00090                 //logger.warn( usMerged.toString() );
00091                 if ( usMerged.size() > 0 )
00092                 {
00093                     pIndexer.indexDocs( usMerged, bCreate );
00094                 }
00095                 if ( create = true ) bCreate = false;
00096             }
00097             catch ( Exception e )
00098             {
00099                 logger.warn( e.getMessage() );
00100             }
00101             //Mise à jour url root (a indexer => indexée)
00102             ( (PovUrl)usUrls.get( url.getUrl() ) ).setState( PovUrl.STATE_VISITED );
00103             //Sauvegarde du la nouvelle liste d'url
00104             try
00105             {
00106                 save( usUrls, "urls.dat" );
00107             }
00108             catch ( Exception e )
00109             {
00110                 logger.warn( e.getMessage() );
00111             }
00112             //Lancement de la prochaine indexation
00113             //PovUrl next = usUrls.nextUrl();
00114             usMerged = null;
00115             System.gc();
00116             /*if ( Runtime.getRuntime().freeMemory() > 1000000 )
00117                 crawl( usUrls.nextUrl(), bCreate, true );*/
00118             org.net2map.pov.indexer.IndexerTask.getInstance().setCurrentUrl(usUrls.nextUrl());
00119         }
00120         else
00121         {
00122             logger.warn( "Url is empty" );
00123         }
00124     }
00125     
00126     public void addCachedUrl( PovUrl url )
00127     {
00128         //logger.warn( " add cached " + url.toString() );
00129         usNews.put( url.getUrl(), url );
00130     }
00131     
00132     private void sphinx( PovUrl url )
00133     {
00134         Crawler theCrawler = new Crawler();
00135         DownloadParameters theDownloadParameters = new DownloadParameters();
00136         theDownloadParameters.changeDownloadTimeout(5);
00137         theDownloadParameters.changeMaxThreads(10);
00138         theDownloadParameters.changeObeyRobotExclusion(true);
00139         
00140         try
00141         {
00142             theCrawler.setRoot ( new Link( new URL( url.getUrl() ) ) );
00143         }
00144         catch ( Exception e )
00145         {
00146             logger.warn( e.getMessage() );
00147         }
00148         theCrawler.setDownloadParameters( theDownloadParameters );
00149         theCrawler.setDepthFirst( false );
00150         theCrawler.setMaxDepth( 1 );
00151         //theCrawler.setAction( new SphinxMirrorAction( mirrordir, true, this, create ) );
00152         
00153         //PovUrlsSet usUrls = new PovUrlsSet();
00154         //SphinxMirrorAction theSMA = new SphinxMirrorAction( sMirrorDirectory, this, url );
00155         theCrawler.setAction( new SphinxMirrorAction( sMirrorDirectory, this, url ) );
00156         //SphinxEventLog theEventLog = new SphinxEventLog();
00157         //theCrawler.addCrawlListener( theEventLog );
00158         //theCrawler.addLinkListener( theEventLog );
00159         
00160         usNews = new PovUrlsSet();
00161         theCrawler.run();
00162         
00163         while ( theCrawler.getState() != CrawlEvent.STOPPED )
00164         {
00165         }
00166         
00167         theCrawler.clear();
00168         theCrawler = null;
00169         logger.warn( "Finished url " + url.getUrl() );
00170         url.setState( url.STATE_VISITED );
00171         
00172         //System.out.println( usUrls );
00173         //System.out.println( theSMA.getUrls().toString() );
00174         //pIndexer.stop();
00175         //return usUrls;
00176         
00177         /*WebRobot robby = new WebRobot();
00178         try
00179         {
00180             robby.setStartURL( new URL( url.getUrl() ) );
00181             robby.setMaxDepth( 1 );
00182             robby.setSleepTime( 0 );
00183 
00184             robby.setAllowWholeHost( true );
00185             robby.setWalkToOtherHosts( true );
00186             robby.setAllowCaching( true );
00187             FileWriter logfile = new FileWriter( "urls.txt" );
00188             URLLogger log = new URLLogger( logfile );
00189             //HttpDocToFile theHttpdtf = new HttpDocToFile( "/home/alain/mirror/" );
00190             //robby.setDocManager(theHttpdtf);
00191             robby.setDocManager( log );
00192             robby.run();
00193             logfile.close();        
00194         }
00195         catch ( Exception e )
00196         {
00197             System.out.println( e );
00198         }*/
00199         
00200     }
00201 
00202     public void save( PovUrlsSet us, String filename ) throws Exception
00203     {
00204         /*try
00205         {
00206             FileWriter file = new FileWriter( filename );
00207             for ( Enumeration e = elements() ; e.hasMoreElements() ;)
00208             {
00209                 file.write( (PovUrl)e.nextElement() + "\n" );
00210             }
00211             file.close();
00212         }
00213         catch ( Exception e )
00214         {
00215             logger.warn( e.getMessage() );
00216         }*/
00217         try
00218         {
00219             ObjectOutputStream out = new ObjectOutputStream( new FileOutputStream( filename ) );
00220             out.writeObject( us );
00221             out.close();
00222             logger.warn( "File " + filename + " saved" );
00223         }
00224         catch ( Exception e )
00225         {
00226             logger.warn( e.getMessage() );
00227             throw e;
00228         }
00229     }
00230     
00231     public PovUrlsSet load( String filename )  throws Exception
00232     {
00233         PovUrlsSet result;
00234         try
00235         {
00236             ObjectInputStream in = new ObjectInputStream( new FileInputStream( filename ) );
00237             result = (PovUrlsSet)in.readObject();
00238             in.close();
00239             logger.warn( "File " + filename + " load" );
00240         }
00241         catch ( Exception e )
00242         {
00243             logger.warn( e.getMessage() );
00244             throw e;
00245         }
00246         return result;
00247     }
00248     
00249     public PovUrl nextUrl()
00250     {
00251         if ( usUrls != null )
00252         {
00253             return usUrls.nextUrl();
00254         }
00255         return null;
00256     }
00257 }
Accueil | Téléchargement | Manuel | Doc. technique | Sources CVS | Faq | Nous contacter
©2003 - All Rights Reserved