The current directory structure of my test script looks like this:
project/
    script/
        __init__.py
        map.py
    test/
        __init__.py
        test_map.py
script/map.py contains the function under test:

def add(x, y):
    return x + y


def map_add(df):
    result = df.map(lambda x: (x.key, x.value)).reduceByKey(add)
    return result
test/test_map.py looks like this:

def add_pyspark_path():
    """
    Add PySpark to the PYTHONPATH
    """
    import sys
    import os
    try:
        sys.path.append(os.path.join(os.environ['SPARK_HOME'], "python"))
        # Spark 1.6
        sys.path.append(os.path.join(os.environ['SPARK_HOME'],
                                     "python", "lib", "py4j-0.9-src.zip"))
    except KeyError:
        print("SPARK_HOME not set")
        sys.exit(1)


# To import pyspark
add_pyspark_path()

import unittest
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

from script.map import map_add


class PySparkTestCase(unittest.TestCase):

    def setUp(self):
        # Set up a new Spark context for each test
        conf = SparkConf()
        conf.set("spark.executor.memory", "1g")
        conf.set("spark.cores.max", "1")
        conf.set("spark.app.name", "nosetest")
        self.sc = SparkContext(conf=conf)

    def tearDown(self):
        self.sc.stop()


# This would go in tests/project_test.py
class MapTests(PySparkTestCase):

    def MockDataFrame(self):
        # Build a mock DataFrame to test the script
        sqlContext = SQLContext(self.sc)
        rdd = self.sc.parallelize([(1, 0), (1, 1), (2, 0), (2, 2)])
        schema = [
            "key",
            "value"
        ]
        dataset = sqlContext.createDataFrame(rdd, schema)
        return dataset

    def test_add(self):
        df = self.MockDataFrame()
        result = map_add(df)
        print(result.collect())
        self.assertEqual(result.count(), 2)
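For reference, this is the result I expect from reduceByKey(add) on the mock rows built in MockDataFrame above (a plain-Python sketch for illustration only, no Spark involved):

# Illustrative only: reproduce the expected grouping of the mock rows with plain Python.
rows = [(1, 0), (1, 1), (2, 0), (2, 2)]

expected = {}
for key, value in rows:
    expected[key] = expected.get(key, 0) + value

print(sorted(expected.items()))  # [(1, 1), (2, 2)] -> two keys, hence count() == 2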
Here is the same map_add written with an inline lambda instead of the named add function:

def map_add(df):
    result = df.map(lambda x: (x.key, x.value)).reduceByKey(lambda x, y: x + y)
    return result
test_add (test.test_map.MapTests) ... 16/01/11 15:38:10 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 5)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 98, in main
command = pickleSer._read_with_length(infile)
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 164, in _read_with_length
return self.loads(obj)
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 419, in loads
return pickle.loads(obj, encoding=encoding)
ImportError: No module named 'script'
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:342)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
16/01/11 15:38:10 ERROR TaskSetManager: Task 0 in stage 2.0 failed 1 times; aborting job
ERROR
======================================================================
ERROR: test_add (test.test_map.MapTests)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/Users/datitran/Desktop/pyspark_test/test/test_map.py", line 56, in test_add
print(result.collect())
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/pyspark/rdd.py", line 771, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/pyspark/sql/utils.py", line 45, in deco
return f(*a, **kw)
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value
format(target_id, ".", name), value)
nose.proxy.Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 5, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 98, in main
command = pickleSer._read_with_length(infile)
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 164, in _read_with_length
return self.loads(obj)
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 419, in loads
return pickle.loads(obj, encoding=encoding)
ImportError: No module named 'script'
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:342)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/worker.py", line 98, in main
command = pickleSer._read_with_length(infile)
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 164, in _read_with_length
return self.loads(obj)
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/serializers.py", line 419, in loads
return pickle.loads(obj, encoding=encoding)
ImportError: No module named 'script'
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:342)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
... 1 more
Best answer
I found the answer to this and thought it would be a good idea to share it with the community, since it may be useful whenever a code base is split across different folders.

So, somehow, relative imports do not work with PySpark here because the import does not happen on the workers, but I found a way around the problem.

First, create another __init__.py file in the top-level directory, in this case the one called "project", so that the structure looks like this:
project/
    __init__.py
    script/
        __init__.py
        map.py
    test/
        __init__.py
        test_map.py
Next, the executors also need to find the project's parent directory, so extend PySparkTestCase to set PYTHONPATH in the executor environment:

import os  # os is now needed at module level, for the PYTHONPATH below

class PySparkTestCase(unittest.TestCase):

    def setUp(self):
        conf = SparkConf()
        conf.set("spark.executor.memory", "1g")
        conf.set("spark.cores.max", "1")
        conf.set("spark.app.name", "nosetest")
        # Put the directory that contains project/ on the executors' PYTHONPATH as well
        conf.setExecutorEnv("PYTHONPATH", "$PYTHONPATH:" +
                            os.path.abspath(os.path.join(os.path.join(
                                os.path.dirname(os.path.abspath(__file__)),
                                os.pardir), os.pardir)))
        self.sc = SparkContext(conf=conf)
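The nested os.path calls simply walk two levels up from test_map.py, i.e. to the directory that contains project/. A more readable equivalent (my own rewrite, assuming the layout shown above and the conf object from the snippet) would be:

import os

# Two levels up from test/test_map.py: .../project/test -> .../project -> its parent
project_parent = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
)
conf.setExecutorEnv("PYTHONPATH", "$PYTHONPATH:" + project_parent)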
Finally, change the import in test_map.py to use the full package path:

from project.script.map import map_add
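An alternative that should also work is to ship the package to the workers explicitly with SparkContext.addPyFile instead of setting PYTHONPATH. A rough, untested sketch, where ship_project_package is a hypothetical helper called from setUp() with the directory that contains project/:

import shutil

def ship_project_package(sc, project_parent_dir):
    """Zip the project/ package and distribute it to the Spark workers.

    project_parent_dir is the directory that contains project/ (hypothetical
    helper, for illustration only).
    """
    archive = shutil.make_archive("/tmp/project_pkg", "zip",
                                  root_dir=project_parent_dir, base_dir="project")
    sc.addPyFile(archive)  # workers can now resolve 'from project.script.map import map_add'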
About "unit-testing - Import error during unit test when calling a function from reduceByKey()", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/34724215/