- c - 在位数组中找到第一个零
- linux - Unix 显示有关匹配两种模式之一的文件的信息
- 正则表达式替换多个文件
- linux - 隐藏来自 xtrace 的命令
我一直在尝试弄清楚问题出在哪里:我想用自己在 Django 应用中编写的一个函数来抓取数据。该函数会访问一个网站、收集数据并将其存入我的数据库。起初我用了一段时间 RQ 和 Redis,但一直收到一条错误消息。于是有人建议我改用 Celery,我照做了。但现在我明白,RQ 和 Celery 都不是问题所在,因为我得到的错误消息和以前完全一样。我试过直接导入该函数,仍然报错;后来我想,如果把实际的函数放在 tasks.py 文件里也许会有所不同,但并没有。下面是我试图在 tasks.py 中使用的函数。
import requests
from bs4 import BeautifulSoup
from src.blog.models import Post
import random
import re
from django.contrib.auth.models import User
import os
@app.task
def p_panties():
    """Scrape the video listing page and save each new video as a draft Post.

    Fetches ``pan_url``, takes the first 13 ``div.post-start`` elements,
    follows each element's link to extract the embedded video URL, and
    creates a ``Post`` for every title that is not already stored.

    Returns:
        list[dict]: one dict per scraped video with the keys ``href``,
        ``text``, ``embed``, ``comments``, ``src``, ``name``, ``url``,
        ``author`` and ``video``.

    Raises:
        IndexError: if a linked page has no ``<p>`` element or no URL in it.
    """
    # NOTE(review): the user-agent strings carry embedded quotes and padding
    # spaces; they are kept byte-for-byte -- confirm this is intentional.
    user_agents = [
        ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" ',
        ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" ',
        ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" ',
        ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" ',
    ]
    headers = {
        # One user agent is picked at random per task run.
        "user-agent": random.choice(user_agents),
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "accept-encoding": "gzip,deflate,sdch",
        "accept-language": "en-US,en;q=0.8",
    }
    # Raw string so the backslash escapes reach the regex engine untouched.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    hd_suffixes = ('– Official video HD', '– Oficial video HD')
    request_timeout = 30  # seconds; without it a stalled server blocks the worker

    def youtube_link(url):
        """Return the first URL found in the first child of the page's first <p>."""
        page = requests.get(url, headers=headers, timeout=request_timeout)
        first_paragraph = BeautifulSoup(page.text, 'html5lib').find_all('p')[0]
        urls = url_pattern.findall(str(first_paragraph.contents[0]))
        return urls[0].replace('?&autoplay=1', '')

    def clean_title(raw_title):
        """Remove the known "official video" suffixes and leading whitespace."""
        for suffix in hd_suffixes:
            raw_title = raw_title.replace(suffix, '')
        return raw_title.lstrip()

    pan_url = 'http://www.example.org'
    listing = requests.get(pan_url, headers=headers, timeout=request_timeout)
    soup = BeautifulSoup(listing.text, 'html5lib')
    # Slice before scraping so only the rows that are kept trigger requests.
    video_row = soup.find_all('div', {'class': 'post-start'})[:13]
    name = 'pan videos'
    author_id = 2 if os.getenv('_system_name') == 'OSX' else 3
    author = User.objects.get(id=author_id)

    entries = []
    for div in video_row:
        href = div.a.get('href')
        title = clean_title(div.h2.text)
        # Fetch the detail page once and reuse the result for embed and image.
        embed = youtube_link(href)
        video_id = embed.replace('https://www.youtube.com/embed/', '')
        entries.append({
            'href': href,
            'text': title,
            'embed': embed,
            'comments': title,
            'src': 'https://i.ytimg.com/vi/' + video_id + '/maxresdefault.jpg',
            'name': name,
            'url': href,
            'author': author,
            'video': True,
        })

    for entry in entries:
        # Titles act as the de-duplication key.
        if Post.objects.filter(title=entry['text']).exists():
            continue
        post = Post()
        post.title = entry['text']
        post.name = entry['name']
        post.url = entry['url']
        post.body = entry['comments']
        post.image_url = entry['src']
        post.video_path = entry['embed']
        post.author = entry['author']
        post.video = entry['video']
        post.status = 'draft'
        post.save()
        # Tags are added after save().
        post.tags.add("video", "Musica")

    # NOTE(review): each entry holds a User instance under 'author'; confirm
    # the Celery result serializer in use can handle it.
    return entries
from tasks import *
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Users/ray/Desktop/myheroku/practice/tasks.py", line 5, in <module>
from src.blog.models import Post
File "/Users/ray/Desktop/myheroku/practice/src/blog/models.py", line 3, in <module>
from taggit.managers import TaggableManager
File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/taggit/managers.py", line 7, in <module>
from django.contrib.contenttypes.models import ContentType
File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/contrib/contenttypes/models.py", line 159, in <module>
class ContentType(models.Model):
File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/contrib/contenttypes/models.py", line 160, in ContentType
app_label = models.CharField(max_length=100)
File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/db/models/fields/__init__.py", line 1072, in __init__
super(CharField, self).__init__(*args, **kwargs)
File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/db/models/fields/__init__.py", line 166, in __init__
self.db_tablespace = db_tablespace or settings.DEFAULT_INDEX_TABLESPACE
File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/conf/__init__.py", line 55, in __getattr__
self._setup(name)
File "/Users/ray/Desktop/myheroku/practice/lib/python3.5/site-packages/django/conf/__init__.py", line 41, in _setup
% (desc, ENVIRONMENT_VARIABLE))
django.core.exceptions.ImproperlyConfigured: Requested setting DEFAULT_INDEX_TABLESPACE, but settings are not configured. You must either define the environment variable DJANGO_SETTINGS_MODULE or call settings.configure() before accessing settings.
import requests
from bs4 import BeautifulSoup
# from src.blog.models import Post
import random
import re
# from django.contrib.auth.models import User
import os
@app.task
def p_panties():
    """Scrape the video listing page and return the collected entries.

    This variant has every Django-dependent step (author lookup and ``Post``
    persistence) commented out, so it only performs HTTP requests and HTML
    parsing.

    Returns:
        list[dict]: one dict per scraped video with the keys ``href``,
        ``text``, ``embed``, ``comments``, ``src``, ``name``, ``url`` and
        ``video``.

    Raises:
        IndexError: if a linked page has no ``<p>`` element or no URL in it.
    """
    # NOTE(review): the user-agent strings carry embedded quotes and padding
    # spaces; they are kept byte-for-byte -- confirm this is intentional.
    user_agents = [
        ' "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0" ',
        ' "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)" ',
        ' "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko" ',
        ' "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0" ',
    ]
    headers = {
        # One user agent is picked at random per task run.
        "user-agent": random.choice(user_agents),
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "accept-encoding": "gzip,deflate,sdch",
        "accept-language": "en-US,en;q=0.8",
    }
    # Raw string so the backslash escapes reach the regex engine untouched.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    hd_suffixes = ('– Official video HD', '– Oficial video HD')
    request_timeout = 30  # seconds; without it a stalled server blocks the worker

    def youtube_link(url):
        """Return the first URL found in the first child of the page's first <p>."""
        page = requests.get(url, headers=headers, timeout=request_timeout)
        first_paragraph = BeautifulSoup(page.text, 'html5lib').find_all('p')[0]
        urls = url_pattern.findall(str(first_paragraph.contents[0]))
        return urls[0].replace('?&autoplay=1', '')

    def clean_title(raw_title):
        """Remove the known "official video" suffixes and leading whitespace."""
        for suffix in hd_suffixes:
            raw_title = raw_title.replace(suffix, '')
        return raw_title.lstrip()

    pan_url = 'http://www.example.org'
    listing = requests.get(pan_url, headers=headers, timeout=request_timeout)
    soup = BeautifulSoup(listing.text, 'html5lib')
    # Slice before scraping so only the rows that are kept trigger requests.
    video_row = soup.find_all('div', {'class': 'post-start'})[:13]
    name = 'pan videos'
    # Django-dependent author lookup, disabled in this variant:
    # if os.getenv('_system_name') == 'OSX':
    #     author = User.objects.get(id=2)
    # else:
    #     author = User.objects.get(id=3)

    entries = []
    for div in video_row:
        href = div.a.get('href')
        title = clean_title(div.h2.text)
        # Fetch the detail page once and reuse the result for embed and image.
        embed = youtube_link(href)
        video_id = embed.replace('https://www.youtube.com/embed/', '')
        entries.append({
            'href': href,
            'text': title,
            'embed': embed,
            'comments': title,
            'src': 'https://i.ytimg.com/vi/' + video_id + '/maxresdefault.jpg',
            'name': name,
            'url': href,
            # 'author': author,
            'video': True,
        })

    # Django-dependent persistence, disabled in this variant:
    # for entry in entries:
    #     post = Post()
    #     post.title = entry['text']
    #     title = post.title
    #     if not Post.objects.filter(title=title):
    #         post.title = entry['text']
    #         post.name = entry['name']
    #         post.url = entry['url']
    #         post.body = entry['comments']
    #         post.image_url = entry['src']
    #         post.video_path = entry['embed']
    #         post.author = entry['author']
    #         post.video = entry['video']
    #         post.status = 'draft'
    #         post.save()
    #         post.tags.add("video", "Musica")
    return entries
[2016-08-13 08:31:17,222: INFO/MainProcess] Received task: tasks.p_panties[e196c6bf-2b87-4bb2-ae11-452e3c41434f]
[2016-08-13 08:31:17,238: INFO/Worker-4] Starting new HTTP connection (1): www.example.org
[2016-08-13 08:31:17,582: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:18,314: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:18,870: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:19,476: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:20,089: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:20,711: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:21,218: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:21,727: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:22,372: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:22,785: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:23,375: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:23,983: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:24,396: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:25,003: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:25,621: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:26,029: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:26,446: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:27,261: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:27,671: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:28,082: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:28,694: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:29,311: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:29,922: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:30,535: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:31,154: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:31,765: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:32,387: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:32,992: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:33,611: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:34,030: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:34,635: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:35,041: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:35,659: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:36,278: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:36,886: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:37,496: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:37,913: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:38,564: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:39,143: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:39,754: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:40,409: INFO/Worker-4] Starting new HTTP connection (1): example.org
[2016-08-13 08:31:40,992: INFO/MainProcess] Task tasks.p_panties[e196c6bf-2b87-4bb2-ae11-452e3c41434f] succeeded in 23.767645187006565s: [{'src': 'https://i.ytimg.com/vi/3bU-AtShW7Y/maxresdefault.jpg', 'name': 'pan videos', 'url':...
environ\
|-src\
|-blog\
|-migrations\
|-static\
|-templates\
|-templatetags\
|-__init__.py
|-admin.py
|-forms.py
|-models
|-tasks
|-urls
|-views
最佳答案
你需要设置django
您似乎是在普通的 Python shell 中运行任务;之所以这样判断,是因为注释掉 Django 模型相关的部分之后,您的代码就可以正常工作。
所以问题在于:在纯 Python shell 中运行时,需要先手动设置 Django 才能正常运行。在 manage.py shell 中运行时,manage.py 会替您完成这些设置;但通过普通 Python 脚本执行时,就必须手动设置。这就是出现缺少 DJANGO_SETTINGS_MODULE 错误的原因。
您似乎还使用了定义的模型,为了能够将它们导入到python脚本中,您需要将项目根文件夹的路径添加到当前python路径中。
最后,您需要告诉django您的设置文件在哪里(在设置django之前),在manage.py文件中,您应该具有如下内容:
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "myapp.settings")
os.environ.setdefault("DJANGO_SETTINGS_MODULE", DEFAULT_SETTINGS_MODULE)
import os
import sys

# Put the project root on the import path so that `manage` (and the
# project's own packages) can be imported from a plain Python script.
sys.path.insert(0, "/path/to/parent/of/src")  # /home/projects/my-crawler

from manage import DEFAULT_SETTINGS_MODULE

# The settings module must be named before django.setup() is called.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", DEFAULT_SETTINGS_MODULE)

import django

django.setup()

# ... The rest of your script ...
.delay()
或
.apply_async()
,以确保代码在后台运行。
import json

from redis import StrictRedis

redis = StrictRedis(host='localhost', port=6379, db=0)

# Store the whole result list as one JSON document under a per-task key.
redis.set("scraping:tasks:results:TASK-ID-HERE", json.dumps(entries))

# Push every entry as its own JSON document onto a list; the pipeline
# queues the RPUSH commands and sends them together on execute().
with redis.pipeline() as pipe:
    for item in entries:
        pipe.rpush("scraping:tasks:results", json.dumps(item))
    pipe.execute()
@celery_app.task
def handle_scraping_results(entries):
    """Handle the scraped ``entries`` in a Celery task.

    Placeholder: do whatever you want with the ``entries`` array here.

    Args:
        entries: the array of scraped entries passed in by the caller.
    """
handle_scraping_results.delay(entries)
import json

# GET does not expand glob patterns, so iterate the matching keys with SCAN.
redis_keys = redis.scan_iter(match="scraping:tasks:results:*")
for key in redis_keys:
    value_of_redis_key = redis.get(key)
    if value_of_redis_key is None:
        # The key vanished between the scan and the read.
        continue
    if isinstance(value_of_redis_key, bytes):
        value_of_redis_key = value_of_redis_key.decode("utf-8")
    # Decode the value that was just read, not the previous `entries`.
    entries = json.loads(value_of_redis_key)
    for entry in entries:
        # Titles act as the de-duplication key.
        if Post.objects.filter(title=entry['text']).exists():
            continue
        post = Post()
        post.title = entry['text']
        post.name = entry['name']
        post.url = entry['url']
        post.body = entry['comments']
        post.image_url = entry['src']
        post.video_path = entry['embed']
        # NOTE(review): JSON cannot carry a User instance; confirm how
        # 'author' is stored in the entries before relying on this line.
        post.author = entry['author']
        post.video = entry['video']
        post.status = 'draft'
        post.save()
        # Tags are added after save().
        post.tags.add("video", "Musica")
关于python - 异步抓取并使用django celery和redis存储结果并存储我的正确方法是什么?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/38933035/
我有一个关于 Redis Pubsub 的练习,如下所示: 如果发布者发布消息但订阅者没有收到服务器崩溃。订阅者如何在重启服务器时收到该消息? 请帮帮我,谢谢! 最佳答案 在这种情况下,消息将永远消失
我们正在使用 Service Stack 的 RedisClient 的 BlockingDequeue 来保存一些数据,直到它可以被处理。调用代码看起来像 using (var client =
我有一个 Redis 服务器和多个 Redis 客户端。每个 Redis 客户端都是一个 WebSocket+HTTP 服务器,其中包括管理 WebSocket 连接。这些 WebSocket+HTT
我有多个 Redis 实例。我使用不同的端口创建了一个集群。现在我想将数据从预先存在的 redis 实例传输到集群。我知道如何将数据从一个实例传输到集群,但是当实例多于一个时,我无法做到这一点。 最佳
配置:三个redis集群分区,跨三组一主一从。当 Master 宕机时,Lettuce 会立即检测到中断并开始重试。但是,Lettuce 没有检测到关联的 slave 已经将自己提升为 master
我想根据从指定集合中检索这些键来删除 Redis 键(及其数据集),例如: HMSET id:1 password 123 category milk HMSET id:2 password 456
我正在编写一个机器人(其中包含要禁用的命令列表),用于监视 Redis。它通过执行禁用命令,例如 (rename-command ZADD "")当我重新启动我的机器人时,如果要禁用的命令列表发生变化
我的任务是为大量听众使用发布/订阅。这是来自 docs 的订阅的简化示例: r = redis.StrictRedis(...) p = r.pubsub() p.subscribe('my-firs
我一直在阅读有关使用 Redis 哨兵进行故障转移的内容。我打算有1个master+1个slave,如果master宕机超过1分钟,就把slave变成master。我知道这在 Sentinel 中是
与仅使用常规 Redis 和创建分片相比,使用 Redis 集群有哪些优势? 在我看来,Redis Cluster 更注重数据安全(让主从架构解决故障)。 最佳答案 我认为当您需要在不丢失任何数据的情
由于 Redis 以被动和主动方式使 key 过期, 有没有办法得到一个 key ,即使它的过期时间已过 (但 在 Redis 中仍然存在 )? 最佳答案 DEBUG OBJECT myKey 将返回
我想用redis lua来实现monitor命令,而不是redis-cli monitor。但我不知道怎么办。 redis.call('monitor') 不起作用。 最佳答案 您不能从 Redis
我读过 https://github.com/redisson/redisson 我发现有几个 Redis 复制设置(包括对 AWS ElastiCache 和 Azure Redis 缓存的支持)
Microsoft.AspNet.SignalR.Redis 和 StackExchange.Redis.Extensions.Core 在同一个项目中使用。前者需要StackExchange.Red
1. 认识 Redis Redis(Remote Dictionary Server)远程词典服务器,是一个基于内存的键值对型 NoSQL 数据库。 特征: 键值(key-value)型,value
1. Redis 数据结构介绍 Redis 是一个 key-value 的数据库,key 一般是 String 类型,但 value 类型多种多样,下面就举了几个例子: value 类型 示例 Str
1. 什么是缓存 缓存(Cache) 就是数据交换的缓冲区,是存贮数据的临时地方,一般读写性能较高。 缓存的作用: 降低后端负载 提高读写效率,降低响应时间 缓存的成本: 数据一致性成本 代码维护成本
我有一份记录 list 。对于我的每条记录,我都需要进行一些繁重的计算,因为我要在Redis中创建反向索引。为了达到到达记录,需要在管道中执行多个redis命令(sadd为100 s + set为1
我有一个三节点Redis和3节点哨兵,一切正常,所有主服务器和从属服务器都经过验证,并且哨兵配置文件已与所有Redis和哨兵节点一起更新,但是问题是当Redis主服务器关闭并且哨兵希望选举失败者时再次
我正在尝试计算Redis中存储的消息之间的响应时间。但是我不知道该怎么做。 首先,我必须像这样存储chat_messages的时间流 ZADD conversation:CONVERSATION_ID
我是一名优秀的程序员,十分优秀!