2016-08-31 5 views
0

Der folgende Code funktioniert gut, ruft aber nur Daten von einer einzigen Seite ab. Kann jemand vorschlagen, wie sich die Seitennummerierung behandeln lässt, sodass der Code zuerst die Daten der ersten Seite, dann die der zweiten und so weiter bis zur letzten Seitennummer abruft?

Crawlen einer Website mit mehreren Seiten mit Java

import java.io.FileOutputStream; 
import java.io.IOException; 
import java.util.ArrayList; 

import org.apache.poi.hssf.usermodel.HSSFRow; 
import org.apache.poi.hssf.usermodel.HSSFSheet; 
import org.apache.poi.hssf.usermodel.HSSFWorkbook; 
import org.apache.poi.ss.usermodel.Row; 

import com.webscrap4j.WebScrap; 
import com.webscrap4j.WebScrapException; 


/**
 * Scrapes a single Okaya inverter category page and exports the product
 * names, selling prices and MRPs it finds to an Excel (.xls) sheet.
 *
 * Only the one URL set below is fetched; pagination is not handled here.
 */
public class okayapower_battery {
    /**
     * Fetches the category page, extracts the product data, writes it to the
     * spreadsheet and echoes it to standard output.
     *
     * @param args unused
     * @throws IOException if the spreadsheet cannot be created or written
     * @throws IllegalStateException if the three extracted lists differ in size
     */
    // NOTE(review): "resource" is still suppressed because the workbook itself is
    // not closed; whether HSSFWorkbook offers close() depends on the POI version
    // in use - confirm before removing the annotation.
    @SuppressWarnings("resource")
    public static void main(String[] args) throws IOException {
        WebScrap ws = new WebScrap();
        ws.setUrl("https://www.okayapower.com/product-category/inverter/");

        try {
            ws.startWebScrap();
            // <h3> carries the product name, <del> the MRP, <ins> the selling price.
            ArrayList<String> names = ws.getSingleHTMLScriptData("<h3>", "</h3>");
            ArrayList<String> mrps = ws.getSingleHTMLScriptData("<del>", "</del>");
            ArrayList<String> prices = ws.getSingleHTMLScriptData("<ins>", "</ins>");

            // Rows are matched by index, so all lists must have the same size.
            // Checked before the output file is opened so that a failed scrape
            // does not truncate an existing spreadsheet.
            if (names.size() != mrps.size() || names.size() != prices.size()) {
                throw new IllegalStateException("Some data is missing");
            }

            HSSFWorkbook workBook = new HSSFWorkbook();
            HSSFSheet sheet = workBook.createSheet("products");

            // Row 0 is the header.
            Row header = sheet.createRow(0);
            header.createCell(0).setCellValue("Product Name");
            header.createCell(1).setCellValue("Product Price");
            header.createCell(2).setCellValue("Product MRP");

            // Data rows start at row 1. No (short) cast on the index: it would
            // wrap around for large result sets.
            for (int i = 0; i < names.size(); i++) {
                HSSFRow row = sheet.createRow(i + 1);
                row.createCell(0).setCellValue(names.get(i));
                row.createCell(1).setCellValue(prices.get(i));
                row.createCell(2).setCellValue(mrps.get(i));
            }

            // try-with-resources guarantees the stream is flushed and closed,
            // even if writing the workbook fails.
            try (FileOutputStream fos = new FileOutputStream(
                    "/Users/parthpatil/Documents/Abm Technologies/Crawl/okaya_battery.xls")) {
                workBook.write(fos);
            }

            for (String name : names) {
                System.out.println("the product are:- " + name);
            }
            for (String mrp : mrps) {
                System.out.println("the MRp are:- " + mrp);
            }
            for (String price : prices) {
                System.out.println("the selling price is:- " + price);
            }
        } catch (WebScrapException e) {
            // Best effort: report the scraping failure and end without output.
            e.printStackTrace();
        }
    }
}
+0

Können Sie die Seitennummerierung verarbeiten? –

+0

Möchten Sie zu 'https://www.okayapower.com/product-category/inverter/' und dann zu 'https://www.okayapower.com/product-category/inverter/page/2/' navigieren, oder möchten Sie alle Links analysieren, die die Seite enthält? – Defozo

+0

Ja, ich möchte von Seite eins zu Seite zwei und so weiter bis zur letzten Seite navigieren. –

Antwort

0

Fügen Sie einfach eine for-Schleife wie for(int page=1; page<=10; page++) { (wobei 10 die letzte Seite ist, die Sie analysieren wollen) direkt vor ws.setUrl(... ein. Dieser Aufruf sollte außerdem zu ws.setUrl("https://www.okayapower.com/product-category/inverter/page/" + String.valueOf(page) + "/"); geändert werden.

Der resultierende Code sollte dann wie folgt aussehen:

import java.io.FileOutputStream; 
import java.io.IOException; 
import java.util.ArrayList; 

import org.apache.poi.hssf.usermodel.HSSFRow; 
import org.apache.poi.hssf.usermodel.HSSFSheet; 
import org.apache.poi.hssf.usermodel.HSSFWorkbook; 
import org.apache.poi.ss.usermodel.Row; 

import com.webscrap4j.WebScrap; 
import com.webscrap4j.WebScrapException; 


/**
 * Scrapes every page of the Okaya inverter category and exports the product
 * names, selling prices and MRPs of all pages to one Excel (.xls) sheet.
 *
 * The rows of all pages are collected first and the workbook is written once
 * at the end. Creating the workbook and the output file inside the page loop
 * would overwrite the file on every iteration and keep only the last page.
 */
public class okayapower_battery {
    /** Listing URL up to the page number; the number and a trailing "/" are appended. */
    private static final String PAGE_URL_PREFIX =
            "https://www.okayapower.com/product-category/inverter/page/";

    /** Number of the last listing page to fetch. */
    private static final int LAST_PAGE = 10;

    /** Destination of the generated spreadsheet. */
    private static final String OUTPUT_PATH =
            "/Users/parthpatil/Documents/Abm Technologies/Crawl/okaya_battery.xls";

    /**
     * Fetches pages 1 to {@code LAST_PAGE}, echoes the data of each page to
     * standard output and writes the combined result to the spreadsheet.
     *
     * A page that fails with a WebScrapException is reported and skipped; the
     * remaining pages are still processed.
     *
     * @param args unused
     * @throws IOException if the spreadsheet cannot be created or written
     * @throws IllegalStateException if the lists extracted from a page differ
     *         in size; nothing is written in that case
     */
    public static void main(String[] args) throws IOException {
        ArrayList<String> al = new ArrayList<String>(); // product names, all pages
        ArrayList<String> bl = new ArrayList<String>(); // MRPs, all pages
        ArrayList<String> cl = new ArrayList<String>(); // selling prices, all pages
        WebScrap ws = new WebScrap();

        for (int page = 1; page <= LAST_PAGE; page++) {
            ws.setUrl(PAGE_URL_PREFIX + page + "/");
            try {
                ws.startWebScrap();
                // <h3> carries the product name, <del> the MRP, <ins> the selling price.
                ArrayList<String> pageNames = ws.getSingleHTMLScriptData("<h3>", "</h3>");
                ArrayList<String> pageMrps = ws.getSingleHTMLScriptData("<del>", "</del>");
                ArrayList<String> pagePrices = ws.getSingleHTMLScriptData("<ins>", "</ins>");

                // Rows are matched by index, so the lists of a page must have
                // the same size before they are appended to the totals.
                if (pageNames.size() != pageMrps.size()
                        || pageNames.size() != pagePrices.size()) {
                    throw new IllegalStateException("Some data is missing");
                }

                for (String adata : pageNames) {
                    System.out.println("the product are:- " + adata);
                }
                for (String bdata : pageMrps) {
                    System.out.println("the MRp are:- " + bdata);
                }
                for (String cdata : pagePrices) {
                    System.out.println("the selling price is:- " + cdata);
                }

                al.addAll(pageNames);
                bl.addAll(pageMrps);
                cl.addAll(pagePrices);
            } catch (WebScrapException e) {
                // Best effort: report the failed page and continue with the next one.
                e.printStackTrace();
            }
        }

        writeWorkbook(al, cl, bl);
    }

    /**
     * Writes a header row followed by one row per product to OUTPUT_PATH.
     * The three lists must have the same size.
     */
    // NOTE(review): "resource" is suppressed because the workbook itself is not
    // closed; whether HSSFWorkbook offers close() depends on the POI version in
    // use - confirm before removing the annotation.
    @SuppressWarnings("resource")
    private static void writeWorkbook(ArrayList<String> names,
                                      ArrayList<String> prices,
                                      ArrayList<String> mrps) throws IOException {
        HSSFWorkbook workBook = new HSSFWorkbook();
        HSSFSheet sheet = workBook.createSheet("products");

        // Row 0 is the header.
        Row header = sheet.createRow(0);
        header.createCell(0).setCellValue("Product Name");
        header.createCell(1).setCellValue("Product Price");
        header.createCell(2).setCellValue("Product MRP");

        // Data rows start at row 1. No (short) cast on the index: it would
        // wrap around once the combined pages exceed the short range.
        for (int i = 0; i < names.size(); i++) {
            HSSFRow row = sheet.createRow(i + 1);
            row.createCell(0).setCellValue(names.get(i));
            row.createCell(1).setCellValue(prices.get(i));
            row.createCell(2).setCellValue(mrps.get(i));
        }

        // try-with-resources guarantees the stream is flushed and closed,
        // even if writing the workbook fails.
        try (FileOutputStream fos = new FileOutputStream(OUTPUT_PATH)) {
            workBook.write(fos);
        }
    }
}