
python-2.7 - BigQuery results run via Python on Google Cloud don't match results run on a MAC


I have a Python application that runs a query on BigQuery and appends the results to a file. I've run it on a MAC workstation (Yosemite) and on a GC instance (Ubuntu 14.1), and the floating-point results differ. How can I make them the same? The Python environments are identical on both.

Run on the Google Cloud instance:
1120224,2015-04-06,23989,866,55159.71274162368,0.04923989554019882,0.021414467106578683,0.03609987911125933,63.69481840834143
54897577,2015-04-06,1188089,43462,2802473.708558333,0.051049132980100984,0.021641920553251377,0.03658143455582873,64.4810111950286

Run on the MAC workstation:
1120224,2015-04-06,23989,866,55159.712741623654,0.049239895540198794,0.021414467106578683,0.03609987911125933,63.694818408341405
54897577,2015-04-06,1188089,43462,2802473.708558335,0.05104913298010102,0.021641920553251377,0.03658143455582873,64.48101119502864


import sys
import pdb
import json
from collections import OrderedDict
from csv import DictWriter
from pprint import pprint
from apiclient import discovery
from apiclient.errors import HttpError
from oauth2client import tools
from oauth2client.client import AccessTokenRefreshError

import functools
import argparse
import httplib2

import time
from subprocess import call



def authenticate_SERVICE_ACCOUNT(service_acct_email, private_key_path):
    """ Generic authentication through a service account.

    Args:
        service_acct_email: The service account email associated with the private key
        private_key_path: The path to the private key file
    """
    from oauth2client.client import SignedJwtAssertionCredentials

    with open(private_key_path, 'rb') as pk_file:
        key = pk_file.read()

    credentials = SignedJwtAssertionCredentials(
        service_acct_email,
        key,
        scope='https://www.googleapis.com/auth/bigquery')

    http = httplib2.Http()
    auth_http = credentials.authorize(http)

    return discovery.build('bigquery', 'v2', http=auth_http)

def create_query(number_of_days_ago):
    """ Create a query.

    Args:
        number_of_days_ago: Default value of 1 gets yesterday's data
    """
    q = 'SELECT xxxxxxxxxx'

    return q

def translate_row(row, schema):
    """Apply the given schema to the given BigQuery data row.

    Args:
        row: A single BigQuery row to transform.
        schema: The BigQuery table schema to apply to the row, specifically
            the list of field dicts.
    Returns:
        Dict containing keys that match the schema and values that match the row.

    Adapted from the BigQuery client:
    https://github.com/tylertreat/BigQuery-Python/blob/master/bigquery/client.py
    """
    log = {}
    #pdb.set_trace()
    # Match each schema column with its associated row value
    for index, col_dict in enumerate(schema):
        col_name = col_dict['name']
        row_value = row['f'][index]['v']

        if row_value is None:
            log[col_name] = None
            continue

        # Cast the value for some types
        if col_dict['type'] == 'INTEGER':
            row_value = int(row_value)
        elif col_dict['type'] == 'FLOAT':
            row_value = float(row_value)
        elif col_dict['type'] == 'BOOLEAN':
            row_value = row_value in ('True', 'true', 'TRUE')

        log[col_name] = row_value

    return log

def extractResult(queryReply):
    """ Extract a result from the query reply. Uses schema and rows to translate.

    Args:
        queryReply: the object returned by BigQuery
    """
    #pdb.set_trace()
    result = []
    schema = queryReply.get('schema', {'fields': None})['fields']
    rows = queryReply.get('rows', [])

    for row in rows:
        result.append(translate_row(row, schema))
    return result


def writeToCsv(results, filename, ordered_fieldnames, withHeader=True):
    """ Create a csv file from a list of rows.

    Args:
        results: list of row dicts to write
        ordered_fieldnames: an OrderedDict with the field names in the desired order -
            names must exist in the results
        withHeader: a boolean indicating whether to write out the header -
            set to False if you are going to append the data to an existing csv
    """
    try:
        the_file = open(filename, "w")
        writer = DictWriter(the_file, fieldnames=ordered_fieldnames)
        if withHeader:
            writer.writeheader()
        writer.writerows(results)
        the_file.close()
    except:
        print "Unexpected error:", sys.exc_info()[0]
        raise


def runSyncQuery(client, projectId, query, timeout=0):
    results = []
    try:
        print 'timeout:%d' % timeout
        jobCollection = client.jobs()
        queryData = {'query': query,
                     'timeoutMs': timeout}

        queryReply = jobCollection.query(projectId=projectId,
                                         body=queryData).execute()

        jobReference = queryReply['jobReference']

        # Timeout exceeded: keep polling until the job is complete.
        while not queryReply['jobComplete']:
            print 'Job not yet complete...'
            queryReply = jobCollection.getQueryResults(
                projectId=jobReference['projectId'],
                jobId=jobReference['jobId'],
                timeoutMs=timeout).execute()

        # If the result has rows, extract the rows in the reply.
        if 'rows' in queryReply:
            #pdb.set_trace();
            result = extractResult(queryReply)
            results.extend(result)

            # Index of the next row to fetch
            currentRow = len(queryReply['rows'])

            # Loop through each remaining page of data
            while 'rows' in queryReply and currentRow < int(queryReply['totalRows']):
                queryReply = jobCollection.getQueryResults(
                    projectId=jobReference['projectId'],
                    jobId=jobReference['jobId'],
                    startIndex=currentRow).execute()
                if 'rows' in queryReply:
                    result = extractResult(queryReply)
                    results.extend(result)
                    currentRow += len(queryReply['rows'])

    except AccessTokenRefreshError:
        print ("The credentials have been revoked or expired, please re-run "
               "the application to re-authorize")

    except HttpError as err:
        print 'Error in runSyncQuery:'
        pprint(err.content)

    except Exception as err:
        print 'Undefined error: %s' % err

    return results


# Main
if __name__ == '__main__':
    # Name of output file
    FILE_NAME = "results.csv"

    # Default prior number of days to run query
    NUMBER_OF_DAYS = "1"

    # BigQuery project id as listed in the Google Developers Console.
    PROJECT_ID = 'xxxxxx'

    # Service account email address as listed in the Google Developers Console.
    SERVICE_ACCOUNT = 'xxxxxx@developer.gserviceaccount.com'
    KEY = "/usr/local/xxxxxxxx"

    query = create_query(NUMBER_OF_DAYS)

    # Authenticate
    client = authenticate_SERVICE_ACCOUNT(SERVICE_ACCOUNT, KEY)

    # Get query results
    results = runSyncQuery(client, PROJECT_ID, query, timeout=0)
    #pdb.set_trace();

    # Write results to csv without header
    ordered_fieldnames = OrderedDict([('f_split', None), ('m_members', None), ('f_day', None), ('visitors', None), ('purchasers', None), ('demand', None), ('dmd_per_mem', None), ('visitors_per_mem', None), ('purchasers_per_visitor', None), ('dmd_per_purchaser', None)])
    writeToCsv(results, FILE_NAME, ordered_fieldnames, False)

    # Backup current data
    backupfilename = "data_bk-" + time.strftime("%y-%m-%d") + ".csv"
    call(['cp', '../data/data.csv', backupfilename])

    # Concatenate new results to data
    with open("../data/data.csv", "ab") as outfile:
        with open("results.csv", "rb") as infile:
            line = infile.read()
            outfile.write(line)

Best Answer

You mention that these are sums over floating-point data. As Felipe noted, floating point is awkward: it violates some of the mathematical identities we tend to assume hold.

In this case, it's the associative property that bites us. That is, we usually expect (A+B)+C == A+(B+C). In floating-point math, however, this is not the case. Each operation is an approximation; you can see this better by wrapping each operation in an "approx" function: approx(approx(A+B) + C) is clearly different from approx(A + approx(B+C)).
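Here's a minimal Python 2.7 sketch of that effect (the values are arbitrary, chosen only so the rounding is visible):

# Floating-point addition is not associative: regrouping changes the result.
a, b, c = 0.1, 0.2, 0.3

left = (a + b) + c     # approx(approx(a + b) + c)
right = a + (b + c)    # approx(a + approx(b + c))

print repr(left)       # 0.6000000000000001
print repr(right)      # 0.6
print left == right    # False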

If you think about how BigQuery computes aggregates, it builds an execution tree and computes the values to be aggregated at the leaves of the tree. As those answers become ready, they are passed back up the tree and aggregated (let's say, added). The "as they become ready" part is what makes this nondeterministic.

The nodes might return their results in the order A, B, C the first time and C, A, B the second time. This means the grouping changes: you get approx(approx(A + B) + C) the first time and approx(approx(C + A) + B) the second time. Note that since we're talking about ordering it looks like the commutative property is the problem, but it isn't; A + B in float math is the same as B + A. The problem is really that you are combining partial results in a different grouping each time.
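A small sketch of that: the same values, aggregated as partial sums that arrive in a different order, can give slightly different totals (the values here are made up purely for illustration):

import random

values = [0.1 * i for i in range(1, 10001)]

# Simulate the leaves of an execution tree: each leaf sums a chunk of rows.
chunks = [values[i:i + 100] for i in range(0, len(values), 100)]
partials = [sum(chunk) for chunk in chunks]

run1 = sum(partials)        # leaves reported in one order

random.shuffle(partials)    # leaves reported in a different order
run2 = sum(partials)

print repr(run1)
print repr(run2)
print run1 == run2          # frequently False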

Floating-point math has all sorts of nasty properties like this, and if you depend on exact precision you should generally avoid it.
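If all you need is for the two environments to produce identical output files, one pragmatic workaround (just a sketch, since the differences in your output are only in the last couple of significant digits; the choice of 12 significant digits is an assumption, pick whatever precision you actually need) is to format the FLOAT columns to a fixed number of significant digits before writing them:

# Format FLOAT values to a fixed number of significant digits before writing,
# so last-bit differences between runs/machines disappear from the CSV.
# fmt='%.12g' (12 significant digits) is an assumed precision.
def format_floats(rows, fmt='%.12g'):
    formatted = []
    for row in rows:
        formatted.append(dict(
            (k, fmt % v if isinstance(v, float) else v)
            for k, v in row.iteritems()))
    return formatted

# Usage with the question's code:
#   writeToCsv(format_floats(results), FILE_NAME, ordered_fieldnames, False)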

Regarding python-2.7 - BigQuery results run via Python on Google Cloud don't match results run on a MAC, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/29496673/
