2016-08-27 3 views

Ich schreibe einen einfachen Web-Scraper, der Beitragstitel, Benutzernamen und den Zeitpunkt des letzten Beitrags aus einem Forum ausliest.

Forum title web scraper

Das Problem ist, dass der Scraper nur den letzten Eintrag der Tabelle ausliest.

Zum Beispiel: Wenn die Tabelle auf diese Weise strukturiert ist:

<tr class=""> 
    <td class="title">  
    <a href="/forums/marketplace/8827" title="View full post details">Title number 1</a> 
    </td> 
    <td class="author"><a href="/members/pursu" title="View member, pursu">pursu</a></td> 
    <td class="count">0</td> 
    <td class="last_post">9 minutes ago</td> 
</tr> 
<tr class="color2"> 
    <td class="title"> 

    <a href="/forums/marketplace/8826" title="View full post details">Title number 2</a> 
    </td> 
    <td class="author"><a href="/members/colinatx" title="View member, colinatx">colinatx</a></td> 
    <td class="count">0</td> 
    <td class="last_post">9 minutes ago</td> 
</tr> 
<tr class=""> 
    <td class="title">  
    <a href="/forums/marketplace/8785" title="View full post details">Title number 3</a> 
    </td> 
    <td class="author"><a href="/members/Object117" title="View member, Object117">Object117</a></td> 
    <td class="count">11</td> 
    <td class="last_post">about 1 hour ago</td> 
</tr> 

Das Ergebnis, das in die .json Ausgabedatei geschrieben wird, ist dies

    "title": "Title number 3", 
    "author": "Object117", 
    "lastpost": "about 1 hour ago" 

Stattdessen sollte es so aussehen:

    "title": "Title number 1", 
    "author": "pursu", 
    "lastpost": "9 minutes ago" 
    "title": "Title number 2", 
    "author": "colinatx", 
    "lastpost": "9 minutes ago" 
    "title": "Title number 3", 
    "author": "Object117", 
    "lastpost": "about 1 hour ago" 

Mein JavaScript:

// Forum scraper as posted in the question: an Express app whose /scrape route
// requests the forum page, reads title/author/last-post text via cheerio and
// writes the result to output.json.
// NOTE(review): this listing is incomplete as shown - the braces opened on the
// app.get, request and fs.writeFile lines are never closed, and `$(this)` is
// used outside any visible callback. Presumably the enclosing selector
// callbacks were lost when the post was copied; confirm against the original.
var express = require('express'); 
var fs = require('fs'); 
var request = require('request'); 
var cheerio = require('cheerio'); 
var app  = express(); 

// Route handler for GET /scrape.
app.get('/scrape', function(req, res){ 

    // This is the URL to pull data from.
    // NOTE(review): assigned without var/let/const, so `url` becomes an implicit global.
    url = 'http://www.pedalroom.com/forums/marketplace'; 

    // The first parameter is our URL 

    // The callback function takes 3 parameters, an error, response status code and the html 
    // NOTE(review): `error` is never checked before `html` is used.
    request(url, function(error, response, html){ 

       // Load the downloaded HTML into cheerio
      var $ = cheerio.load(html); 

       // Variables that capture data.
       // `json` is a single object, so it can hold the values of one post only;
       // every assignment below overwrites whatever was stored before.
      var title, author, lastpost; 
      var json = { title : "", author : "", lastpost : ""}; 


       // Title: text of the first child of the current element
       var data = $(this); 

       title = data.children().first().text(); 

       json.title = title; 

       // Author: text of the first child of the current element
       var data = $(this); 

       author = data.children().first().text(); 

       json.author = author; 

       // Last post time: text of the current element itself
       var data = $(this); 

       lastpost = data.text(); 

       json.lastpost = lastpost; 
     // Serialize the single `json` object (4-space indent) into output.json.
     // NOTE(review): `err` is not inspected, so a failed write is still logged as a success.
     fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){ 

      console.log('File successfully written! - Check your project directory for the output.json file'); 


     // Finally, we'll just send out a message to the browser reminding you that this app does not have a UI. 
     res.send('Check your console!') 


// NOTE(review): no app.listen(...) call is visible in this listing, although the
// message below says the app is on port 8081.
console.log('Magic happens on port 8081'); 
exports = module.exports = app; 

Muss ich den Code irgendwie in einer Schleife ausführen, oder liegt es an etwas anderem?



In Ihrem Code erfassen Sie nur das erste Element der ersten Zeile, weil Sie nicht über jede Tabellenzeile iterieren. Hier ist der funktionierende Code:

/**
 * Forum scraper.
 *
 * Exposes GET /scrape on an Express app. Each request downloads the forum
 * listing page, collects one { title, author, lastpost } record per topic
 * row and writes the resulting array to output.json as pretty-printed JSON.
 */
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

app.get('/scrape', function (req, res) {

    // This is the URL to pull data from (declared locally to avoid an implicit global)
    var url = 'http://www.pedalroom.com/forums/marketplace';

    // The first parameter is our URL.
    // The callback receives an error, the response object and the page HTML.
    request(url, function (error, response, html) {

        // Without this guard a network failure would hand `undefined` to cheerio.
        if (error) {
            console.error('Request to ' + url + ' failed:', error);
            res.status(500).send('Could not fetch the forum page.');
            return;
        }

        // Parse the downloaded HTML
        var $ = cheerio.load(html);

        // One entry per topic row; an array so that later rows do not
        // overwrite earlier ones.
        var data = [];

        // For each row of the table
        $('.topics tr').each(function (index, element) {
            var row = $(element);
            var titleLink = row.find('.title a');

            // Rows without a title link carry no topic; skip them.
            if (titleLink.length === 0) {
                return;
            }

            // .text() yields decoded plain text, whereas .html() would return
            // entity-encoded markup (e.g. "&amp;" instead of "&").
            data.push({
                title: titleLink.first().text().trim(),
                author: row.find('.author a').first().text().trim(),
                lastpost: row.find('.last_post').first().text().trim()
            });
        });

        fs.writeFile('output.json', JSON.stringify(data, null, 4), function (err) {

            // Report a failed write instead of claiming success.
            if (err) {
                console.error('Could not write output.json:', err);
                res.status(500).send('Could not write output.json.');
                return;
            }

            console.log('File successfully written! - Check your project directory for the output.json file');

            // Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
            res.send('Check your console!');
        });
    });
});

// Start the server on the port announced by the log message below.
app.listen(8081);
console.log('Magic happens on port 8081');
exports = module.exports = app;