
Index error in Elasticsearch with Python

I am pulling data from Twitter, filtering it, and using a generator to bulk index the results into Elasticsearch with helpers. However, I get the following error, and I cannot pinpoint exactly where the problem is.

Traceback (most recent call last): 
    File "/Users/aqm1152/_acert_/basic/test_collection_dump.py", line 245, in <module> 
    sinceid, complete, api_counter, maxid = search_tweets(qu=query_word, cnt=cnt, sinceid=x , maxitr= 149 , fname=query_word) 
    File "/Users/aqm1152/_acert_/basic/test_collection_dump.py", line 138, in search_tweets 
    res = elastic_search.bulk_es(actions=bulk_content,) 
    File "/Users/aqm1152/_acert_/basic/elasticsearch/acert_basic_elastic_functions.py", line 68, in bulk_es 
    return helpers.bulk(self.es, actions=actions ,stats_only=True) 
    File "/Users/aqm1152/anaconda/lib/python3.5/site-packages/elasticsearch/helpers/__init__.py", line 194, in bulk 
    for ok, item in streaming_bulk(client, actions, **kwargs): 
    File "/Users/aqm1152/anaconda/lib/python3.5/site-packages/elasticsearch/helpers/__init__.py", line 162, in streaming_bulk 
    for result in _process_bulk_chunk(client, bulk_actions, raise_on_exception, raise_on_error, **kwargs): 
    File "/Users/aqm1152/anaconda/lib/python3.5/site-packages/elasticsearch/helpers/__init__.py", line 134, in _process_bulk_chunk 
    raise BulkIndexError('%i document(s) failed to index.' % len(errors), errors) 
elasticsearch.helpers.BulkIndexError: ('46 document(s) failed to index.', [
    {'index': {'_index': 'twitter', '_id': '866553007252488192', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}},
    {'index': {'_index': 'twitter', '_id': '866552145507700736', '_type': 'tweet', 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'parse_exception', 'reason': 'field must be either [lat], [lon] or [geohash]'}}, 'status': 400}},
    ... 44 more entries, identical except for the '_id' value ...])

It seems that Elasticsearch is having trouble ingesting what appears to be geo location data. In addition, I pulled roughly 671 tweets, but only 454 appear to be indexed when I count them in Elasticsearch, and that shortfall does not even account for the 46 documents that failed because of the geo data, so I cannot tell exactly what is going on.
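A quick way to confirm whether the failures line up with missing geo data is to count how many of the collected tweets have no coordinates at all (a minimal diagnostic sketch; `tweets` stands for the filtered tweet dicts):

    # Minimal diagnostic sketch: tweets whose 'coordinates' field is missing
    # or null are the ones a geo_point mapping will reject as-is.
    missing_geo = sum(1 for t in tweets if t.get('coordinates') is None)
    print('%d of %d tweets have no coordinates' % (missing_geo, len(tweets)))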

Here is the template I am using for indexing:

{
  "template": "twitter",
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "tweet": {
      "properties": {
        "coordinates": {
          "type": "geo_point"
        },
        "created_at": {
          "format": "EEE MMM dd HH:mm:ss Z YYYY",
          "type": "date"
        },
        "entities": {
          "properties": {
            "hashtags": {
              "properties": {
                "indices": {
                  "type": "long",
                  "index": "not_analyzed"
                },
                "text": {
                  "type": "text"
                }
              }
            },
            "urls": {
              "properties": {
                "display_url": {
                  "type": "text",
                  "index": "not_analyzed"
                },
                "expanded_url": {
                  "type": "text",
                  "index": "not_analyzed"
                },
                "indices": {
                  "type": "long",
                  "index": "not_analyzed"
                },
                "url": {
                  "type": "text",
                  "index": "not_analyzed"
                }
              }
            }
          }
        },
        "symbols": {
          "type": "integer",
          "index": "not_analyzed"
        },
        "favorite_count": {
          "type": "double",
          "index": "not_analyzed"
        },
        "id": {
          "type": "long"
        },
        "lang": {
          "type": "text",
          "index": "analyzed"
        },
        "place": {
          "properties": {
            "attributes": {
              "type": "object"
            },
            "bounding_box": {
              "type": "geo_point"
            },
            "country": {
              "type": "text",
              "index": "no"
            },
            "country_code": {
              "type": "text"
            },
            "full_name": {
              "type": "text",
              "index": "no"
            },
            "id": {
              "type": "text"
            },
            "name": {
              "type": "text",
              "index": "not_analyzed"
            },
            "place_type": {
              "type": "text"
            },
            "url": {
              "type": "text"
            }
          }
        },
        "retweet_count": {
          "type": "long"
        },
        "source": {
          "type": "text"
        },
        "text": {
          "type": "text"
        },
        "user": {
          "type": "object",
          "properties": {
            "id": {
              "type": "long"
            },
            "screen_name": {
              "type": "text"
            }
          }
        }
      }
    }
  }
}
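For reference, a template like this is typically registered once before any indexing happens; a minimal sketch with the Python client (the host, file name, and template name here are illustrative assumptions):

    import json
    from elasticsearch import Elasticsearch

    # Minimal sketch: load the template JSON and register it, so any index
    # matching "twitter" picks up these mappings. Names are assumptions.
    es = Elasticsearch(['localhost:9200'])
    with open('twitter_template.json') as f:
        template_body = json.load(f)
    es.indices.put_template(name='twitter_template', body=template_body)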

Here is the code where I use my generator to build the actions that get fed to helpers.bulk:

from collections import defaultdict
from elasticsearch import helpers

# Global variable: the tweet fields to keep; 'a:b' denotes a nested field
tweet_attributes = ['text', 'source', 'retweeted', 'retweet_count', 'place',
                    'lang', 'favorite_count', 'entities', 'id', 'created_at',
                    'user:id', 'user:screen_name', 'coordinates']

def _get_necessary_fields(tweets):
    for tweet in tweets:
        # build a fresh document per tweet, so fields from one tweet
        # cannot leak into the next yielded document
        doc = defaultdict(dict)
        for fields in tweet_attributes:
            keys = fields.split(':')
            if len(keys) == 2:
                # nested field, one level deep
                doc[keys[0]][keys[1]] = tweet[keys[0]][keys[1]]
                # TODO implement for more than one level, needs a better algorithm
            else:
                # plain top-level field
                if fields in tweet:
                    doc[fields] = tweet[fields]
        yield doc

def _json_for_bulk_body(tweets, el):
    # TODO refactor this code when you have time:
    # http://stackoverflow.com/questions/20288770/how-to-use-bulk-api-to-store-the-keywords-in-es-by-using-python
    structured_json_body = ({
        "_op_type": "index",
        "_index": el[0],    # index name: twitter
        "_type": el[1][0],  # type: tweet
        "_id": doc['id'],   # id of the tweet
        "_source": doc} for doc in _get_necessary_fields(tweets))
    return structured_json_body

# the resulting generator is then handed to helpers.bulk (inside bulk_es):
helpers.bulk(self.es, actions=structured_json_body, stats_only=True)
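To see which specific documents a batch rejects without aborting the whole call, helpers.bulk can also be asked to collect per-document errors instead of raising. A minimal sketch (the `es` client and `actions` generator are assumed to exist):

    from elasticsearch import helpers

    # Minimal sketch: with raise_on_error=False and stats_only left at its
    # default (False), helpers.bulk returns (success_count, error_list),
    # where each error looks like the entries in the BulkIndexError above.
    success, errors = helpers.bulk(es, actions=actions, raise_on_error=False)
    for err in errors:
        info = err['index']
        print(info['_id'], info['error']['caused_by']['reason'])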

Can someone please explain why not all of the remaining documents were ingested, and also why those 46 documents fail to index?


The error says 'field must be either [lat], [lon] or [geohash]'. Is it possible that some of your documents have no "coordinates"? – Val


Yes, in some of these documents they are null, since Twitter returned no location for them. How can I avoid this error in my mapping? – Aboogie

Answer


In your code, you need to check for this null condition and not set the coordinates field when it occurs:

# plain top-level field
if fields in tweet:
    if tweet[fields] is not None:    # <--- add this check
        doc[fields] = tweet[fields]
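Put together, the inner loop of the generator then skips null values entirely, so tweets without a location simply omit coordinates instead of sending null into the geo_point field. A minimal sketch of the amended generator (using the same `tweet_attributes` list as in the question):

    from collections import defaultdict

    def _get_necessary_fields(tweets):
        # Sketch with the null check applied: any field whose value is None
        # (e.g. 'coordinates' on tweets without a location) is left out of
        # the document entirely.
        for tweet in tweets:
            doc = defaultdict(dict)  # fresh document per tweet
            for fields in tweet_attributes:
                keys = fields.split(':')
                if len(keys) == 2:
                    # nested field, one level deep; guard the parent too
                    if tweet.get(keys[0]) is not None:
                        doc[keys[0]][keys[1]] = tweet[keys[0]][keys[1]]
                elif fields in tweet and tweet[fields] is not None:
                    doc[fields] = tweet[fields]
            yield doc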