apache-spark
  • pyspark
    import os
    import sys

    os.chdir("/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/bin")
    os.curdir

    if 'SPARK_HOME' not in os.environ:
        os.environ['SPARK_HOME'] = '/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7'
    SPARK_HOME = os.environ['SPARK_HOME']

    sys.path.insert(0, os.path.join(SPARK_HOME, "python"))
    sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib"))
    sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib", "pyspark.zip"))
    sys.path.insert(0, os.path.join(SPARK_HOME, "python", "lib", "py4j-0.9-src.zip"))

    from pyspark import SparkContext
    from pyspark import SparkConf

    # Optionally configure Spark settings
    conf = SparkConf()
    conf.set("spark.executor.memory", "1g")
    conf.set("spark.cores.max", "2")
    conf.setAppName("V2 Maestros")

    # Initialize SparkContext. Run only once, otherwise you get a
    # multiple-context error.
    sc = SparkContext('local', conf=conf)

    # Test to make sure everything works.
    lines = sc.textFile("auto-data.csv")
    lines.count()

    This is the error that occurred. It was a simple program that counted the lines of the input file, but this error came up. I have referenced the file in both locations in the code, yet the result is the same. Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe

    Py4JJavaError        Traceback (most recent call last) 
    <ipython-input-6-5c9242495358> in <module>() 
         1 lines = sc.textFile("auto-save.csv") 
    ----> 2 lines.count() 
    
    /home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in count(self) 
        1006   3 
        1007   """ 
    -> 1008   return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() 
        1009 
        1010  def stats(self): 
    
    /home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in sum(self) 
        997   6.0 
        998   """ 
    --> 999   return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add) 
        1000 
        1001  def count(self): 
    
    /home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in fold(self, zeroValue, op) 
        871   # zeroValue provided to each partition is unique from the one provided 
        872   # to the final reduce call 
    --> 873   vals = self.mapPartitions(func).collect() 
        874   return reduce(op, vals, zeroValue) 
        875 
    
    /home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/rdd.pyc in collect(self) 
        774   """ 
        775   with SCCallSiteSync(self.context) as css: 
    --> 776    port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd()) 
        777   return list(_load_from_socket(port, self._jrdd_deserializer)) 
        778 
    
    /home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args) 
        931   answer = self.gateway_client.send_command(command) 
        932   return_value = get_return_value(
    --> 933    answer, self.gateway_client, self.target_id, self.name) 
        934 
        935   for temp_arg in temp_args: 
    
    /home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/pyspark/sql/utils.pyc in deco(*a, **kw) 
        61  def deco(*a, **kw): 
        62   try: 
    ---> 63    return f(*a, **kw) 
        64   except py4j.protocol.Py4JJavaError as e: 
        65    s = e.java_exception.toString() 
    
    /home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 
        310     raise Py4JJavaError(
        311      "An error occurred while calling {0}{1}{2}.\n". 
    --> 312      format(target_id, ".", name), value) 
        313    else: 
        314     raise Py4JError(
    
    Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe. 
    : org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: file:/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/auto-save.csv 
        at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:287) 
        at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:229) 
        at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:315) 
        at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:200) 
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248) 
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246) 
        at scala.Option.getOrElse(Option.scala:121) 
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:246) 
        at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) 
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248) 
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246) 
        at scala.Option.getOrElse(Option.scala:121) 
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:246) 
        at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:53) 
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248) 
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246) 
        at scala.Option.getOrElse(Option.scala:121) 
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:246) 
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1911) 
        at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:893) 
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) 
        at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) 
        at org.apache.spark.rdd.RDD.withScope(RDD.scala:358) 
        at org.apache.spark.rdd.RDD.collect(RDD.scala:892) 
        at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:453) 
        at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala) 
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 
        at java.lang.reflect.Method.invoke(Method.java:497) 
        at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237) 
        at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) 
        at py4j.Gateway.invoke(Gateway.java:280) 
        at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128) 
        at py4j.commands.CallCommand.execute(CallCommand.java:79) 
        at py4j.GatewayConnection.run(GatewayConnection.java:211) 
        at java.lang.Thread.run(Thread.java:745) 
    

    I think the error message is pretty self-explanatory. Spark is looking for a file on your local file system at '/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/auto-save.csv' and doesn't find one. In general, it is probably better to use an absolute path in the 'textFile' call. – santon
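
    A minimal sketch of what that comment suggests (the path below assumes the CSV really sits in the Spark install directory; adjust it to wherever the file actually lives):

    import os

    csv_path = "/home/hp/Downloads/spark-2.0.0-bin-hadoop2.7/auto-data.csv"  # assumed location
    print(os.path.exists(csv_path))  # False here already explains the InvalidInputException

    # An explicit file:// URI leaves no ambiguity about the working directory.
    lines = sc.textFile("file://" + csv_path)
    print(lines.count())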

    Answer


    You should read your file as

    lines=sc.textFile("hdfs:///tmp/auto-data.csv") 
    

    or just

    lines=sc.textFile("/tmp/auto-data.csv") 
    

    This would read your data from HDFS.
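
    Note that whether the bare "/tmp/auto-data.csv" form resolves to HDFS or to the local disk depends on the configured default filesystem, so an explicit URI scheme is the safer habit. A minimal sketch of both variants (the /tmp paths are assumptions, and the file must exist at whichever location is used):

    # Explicit schemes make the intended filesystem unambiguous.
    local_lines = sc.textFile("file:///tmp/auto-data.csv")  # local filesystem
    hdfs_lines = sc.textFile("hdfs:///tmp/auto-data.csv")   # HDFS
    print(local_lines.count())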
