![]() |
|||
// PovCrawler.java
package org.net2map.pov.documentsManager;

//import websphinx.*;

import java.beans.XMLEncoder;
import java.beans.XMLDecoder;

import java.io.*;

import java.net.URL;
import java.net.MalformedURLException;

/*import net.matuschek.spider.WebRobot;
import net.matuschek.http.URLLogger;
import net.matuschek.http.HttpDocToFile;*/

import websphinx.*;

/**
 * Crawls a URL one level deep with a WebSPHINX {@link Crawler}, merges the
 * URLs reported back through {@link #addCachedUrl(PovUrl)} into the known
 * URL list, hands the merged set to the {@link PovIndexer}, and persists
 * the URL list to disk between passes.
 *
 * <p>The URL sets ({@code usUrls}, {@code usNews}) are <b>static</b> and
 * therefore shared by every instance; constructing a new
 * {@code PovCrawler} replaces {@code usUrls} with an empty set.
 */
public class PovCrawler
{
    static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger( PovCrawler.class.getName() );

    /** File used to persist the URL list between crawl passes. */
    private static final String URLS_FILE = "urls.dat";

    /** Pause between two checks of the crawler state while waiting for it to stop. */
    private static final long STATE_POLL_INTERVAL_MS = 50L;

    /** usUrls: every URL known so far; usNews: URLs reported during the current crawl pass. */
    static PovUrlsSet usUrls, usNews;
    //PovUrlsSet htUrlsToCrawl;
    int nCurrentLevel = 0;
    PovIndexer pIndexer;
    String sMirrorDirectory = "";

    /**
     * Creates a crawler and resets the shared list of known URLs.
     *
     * @param indexer         indexer that receives the URLs to index
     * @param mirrordirectory directory passed to the mirror action
     */
    public PovCrawler( PovIndexer indexer, String mirrordirectory )
    {
        logger.setLevel( org.apache.log4j.Level.WARN );
        pIndexer = indexer;
        sMirrorDirectory = mirrordirectory;
        usUrls = new PovUrlsSet();
    }

    /**
     * Runs one crawl-and-index pass.
     *
     * <p>Steps: optionally reload the URL list from {@code urls.dat} (only
     * when {@code filecontinue} is set and the in-memory list is empty),
     * crawl the selected URL, merge the newly found URLs into the known
     * list, index the merged set, mark the crawled URL as visited, save the
     * list, and publish the next URL to crawl on the {@code IndexerTask}
     * singleton.
     *
     * @param root         URL to crawl; replaced by the next URL of the
     *                     reloaded list when that list is loaded successfully
     * @param create       forwarded to {@code PovIndexer.indexDocs}
     * @param filecontinue whether to resume from the saved URL list
     */
    public void crawl( PovUrl root, boolean create, boolean filecontinue )
    {
        // Captured before any reload: tells whether this pass started with no known URL.
        boolean bInit = ( usUrls.size() == 0 );

        PovUrl url = root;

        // Reload the URL list from file.
        if ( filecontinue && usUrls.size() == 0 )
        {
            try
            {
                PovUrlsSet loaded = load( URLS_FILE );
                // Only replace the shared list when the file really held one.
                if ( loaded != null )
                {
                    usUrls = loaded;
                    url = usUrls.nextUrl();
                }
            }
            catch ( Exception e )
            {
                logger.warn( e.getMessage(), e );
            }
        }

        if ( url == null )
        {
            logger.warn( "Url is empty" );
            return;
        }

        logger.warn( "Launch crawl with url " + url.toString() );

        // Build the list of new URLs (fills usNews).
        sphinx( url );

        // Merge the new URLs into the base list.
        // NOTE(review): presumably merge() returns the entries that still
        // need indexing - confirm against PovUrlsSet.
        PovUrlsSet usMerged = usUrls.merge( usNews );

        if ( bInit )
        {
            // First pass: make sure the crawled URL itself is part of the set to index.
            PovUrl existing = (PovUrl)usMerged.get( url.getUrl() );
            if ( existing == null )
            {
                usMerged.put( url.getUrl(), url );
            }
            else
            {
                existing.setDepth( url.getDepth() );
            }
        }

        logger.warn( usMerged.size() + " url à indexer");
        logger.warn( usUrls.size() + " url courantes au total");

        // Index the list of new URLs.
        try
        {
            if ( usMerged.size() > 0 )
            {
                pIndexer.indexDocs( usMerged, create );
            }
        }
        catch ( Exception e )
        {
            logger.warn( e.getMessage(), e );
        }

        // Update the crawled URL (to index => indexed). The entry may be
        // absent from the known list; skip instead of failing so the list
        // is still saved and the next URL is still published.
        PovUrl known = (PovUrl)usUrls.get( url.getUrl() );
        if ( known != null )
        {
            known.setState( PovUrl.STATE_VISITED );
        }
        else
        {
            logger.warn( "Crawled url " + url.getUrl() + " is missing from the url list" );
        }

        // Save the new URL list.
        try
        {
            save( usUrls, URLS_FILE );
        }
        catch ( Exception e )
        {
            logger.warn( e.getMessage(), e );
        }

        // Release the merged set before the next pass is scheduled.
        usMerged = null;
        System.gc();

        // Schedule the next indexing pass.
        org.net2map.pov.indexer.IndexerTask.getInstance().setCurrentUrl( usUrls.nextUrl() );
    }

    /**
     * Records a URL found during the current crawl pass.
     *
     * <p>{@code usNews} is created by {@code sphinx()} right before the
     * crawler is started, so this must only be called during a crawl.
     *
     * @param url URL to add, keyed by {@code url.getUrl()}
     */
    public void addCachedUrl( PovUrl url )
    {
        usNews.put( url.getUrl(), url );
    }

    /**
     * Crawls {@code url} breadth-first to a maximum depth of 1 and lets a
     * {@code SphinxMirrorAction} process the pages. Resets {@code usNews}
     * before the crawl and marks {@code url} as visited afterwards.
     *
     * @param url root URL of the crawl
     */
    private void sphinx( PovUrl url )
    {
        Crawler theCrawler = new Crawler();
        DownloadParameters theDownloadParameters = new DownloadParameters();
        theDownloadParameters.changeDownloadTimeout(5);
        theDownloadParameters.changeMaxThreads(10);
        theDownloadParameters.changeObeyRobotExclusion(true);

        try
        {
            theCrawler.setRoot ( new Link( new URL( url.getUrl() ) ) );
        }
        catch ( Exception e )
        {
            // Best effort: an invalid root is logged and the crawl still runs.
            logger.warn( e.getMessage(), e );
        }
        theCrawler.setDownloadParameters( theDownloadParameters );
        theCrawler.setDepthFirst( false );
        theCrawler.setMaxDepth( 1 );
        theCrawler.setAction( new SphinxMirrorAction( sMirrorDirectory, this, url ) );

        usNews = new PovUrlsSet();
        theCrawler.run();

        // Wait for the crawler to report STOPPED. Sleep between checks
        // instead of spinning so the wait does not monopolise a CPU core.
        // An interrupt does not shorten the wait; the interrupt flag is
        // restored once the crawler has stopped.
        boolean interrupted = false;
        while ( theCrawler.getState() != CrawlEvent.STOPPED )
        {
            try
            {
                Thread.sleep( STATE_POLL_INTERVAL_MS );
            }
            catch ( InterruptedException ie )
            {
                interrupted = true;
            }
        }
        if ( interrupted )
        {
            Thread.currentThread().interrupt();
        }

        theCrawler.clear();
        theCrawler = null;
        logger.warn( "Finished url " + url.getUrl() );
        url.setState( PovUrl.STATE_VISITED );
    }

    /**
     * Serializes a URL set to a file with Java object serialization.
     *
     * @param us       set to write
     * @param filename destination file
     * @throws Exception any failure while opening, writing or closing the
     *                   file; the message is logged before it is rethrown
     */
    public void save( PovUrlsSet us, String filename ) throws Exception
    {
        FileOutputStream fileOut = null;
        try
        {
            fileOut = new FileOutputStream( filename );
            ObjectOutputStream out = new ObjectOutputStream( fileOut );
            out.writeObject( us );
            // Closing flushes buffered data; a failure here must reach the caller.
            out.close();
            logger.warn( "File " + filename + " saved" );
        }
        catch ( Exception e )
        {
            logger.warn( e.getMessage() );
            throw e;
        }
        finally
        {
            // Releases the file handle when a step above failed.
            closeQuietly( fileOut );
        }
    }

    /**
     * Reads a URL set previously written by {@link #save(PovUrlsSet, String)}.
     *
     * @param filename file to read
     * @return the deserialized set
     * @throws Exception any failure while opening, reading or closing the
     *                   file; the message is logged before it is rethrown
     */
    public PovUrlsSet load( String filename ) throws Exception
    {
        FileInputStream fileIn = null;
        try
        {
            fileIn = new FileInputStream( filename );
            ObjectInputStream in = new ObjectInputStream( fileIn );
            PovUrlsSet result = (PovUrlsSet)in.readObject();
            in.close();
            logger.warn( "File " + filename + " load" );
            return result;
        }
        catch ( Exception e )
        {
            logger.warn( e.getMessage() );
            throw e;
        }
        finally
        {
            // Releases the file handle when a step above failed.
            closeQuietly( fileIn );
        }
    }

    /**
     * Returns the next URL of the known list.
     *
     * @return the result of {@code usUrls.nextUrl()}, or {@code null} when
     *         there is no known list
     */
    public PovUrl nextUrl()
    {
        if ( usUrls != null )
        {
            return usUrls.nextUrl();
        }
        return null;
    }

    /** Closes an output stream, ignoring a null argument and any close failure. */
    private static void closeQuietly( OutputStream stream )
    {
        if ( stream == null )
        {
            return;
        }
        try
        {
            stream.close();
        }
        catch ( IOException ignored )
        {
            // Nothing useful can be done; the primary error (if any) is already reported.
        }
    }

    /** Closes an input stream, ignoring a null argument and any close failure. */
    private static void closeQuietly( InputStream stream )
    {
        if ( stream == null )
        {
            return;
        }
        try
        {
            stream.close();
        }
        catch ( IOException ignored )
        {
            // Nothing useful can be done; the primary error (if any) is already reported.
        }
    }
}
|||
|
Accueil | Téléchargement | Manuel
| Doc. technique | Sources CVS |
Faq | Nous contacter
©2003 - All Rights Reserved |
|||