gpt4 book ai didi

html - 解析 html 文件时 Fonduer max_storage_temp_tutorial 错误

转载 作者:行者123 更新时间:2023-11-30 09:46:28 25 4
gpt4 key购买 nike

我正在学习使用 Fonduer 从文本文档构建知识库。在附加的 Jupyter 笔记本中执行 max_storage_temp_tutorial 教程时,我在尝试执行以下代码时收到错误:

corpus_parser = Parser(structural=True, lingual=True, visual=True, pdf_path=pdf_path)
%time corpus_parser.apply(doc_preprocessor, parallelism=PARALLEL)

以下是我收到的错误:

UnicodeEncodeError: 'ascii' codec can't encode character '\uf0b7' in position 6282: ordinal not in range(128)

以下是错误的堆栈跟踪:

[INFO] fonduer.utils.udf - Clearing existing...
[INFO] fonduer.utils.udf - Running UDF...
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<timed eval> in <module>()

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/fonduer/utils/udf.py in apply(self, xs, clear, parallelism, progress_bar, count, **kwargs)
48 self.logger.info("Running UDF...")
49 if parallelism is None or parallelism < 2:
---> 50 self.apply_st(xs, progress_bar, clear=clear, count=count, **kwargs)
51 else:
52 self.apply_mt(xs, parallelism, clear=clear, **kwargs)

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/fonduer/utils/udf.py in apply_st(self, xs, progress_bar, count, **kwargs)
81
82 # Commit session and close progress bar if applicable
---> 83 udf.session.commit()
84 if pb:
85 pb.close()

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in commit(self)
941 raise sa_exc.InvalidRequestError("No transaction is begun.")
942
--> 943 self.transaction.commit()
944
945 def prepare(self):

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in commit(self)
465 self._assert_active(prepared_ok=True)
466 if self._state is not PREPARED:
--> 467 self._prepare_impl()
468
469 if self._parent is None or self.nested:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _prepare_impl(self)
445 if self.session._is_clean():
446 break
--> 447 self.session.flush()
448 else:
449 raise exc.FlushError(

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in flush(self, objects)
2252 try:
2253 self._flushing = True
-> 2254 self._flush(objects)
2255 finally:
2256 self._flushing = False

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _flush(self, objects)
2378 except:
2379 with util.safe_reraise():
-> 2380 transaction.rollback(_capture_exception=True)
2381
2382 def bulk_save_objects(

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/langhelpers.py in __exit__(self, type_, value, traceback)
64 self._exc_info = None # remove potential circular references
65 if not self.warn_only:
---> 66 compat.reraise(exc_type, exc_value, exc_tb)
67 else:
68 if not compat.py3k and self._exc_info and self._exc_info[1]:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/compat.py in reraise(tp, value, tb, cause)
247 if value.__traceback__ is not tb:
248 raise value.with_traceback(tb)
--> 249 raise value
250
251 else:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/session.py in _flush(self, objects)
2342 self._warn_on_events = True
2343 try:
-> 2344 flush_context.execute()
2345 finally:
2346 self._warn_on_events = False

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/unitofwork.py in execute(self)
384 while set_:
385 n = set_.pop()
--> 386 n.execute_aggregate(self, set_)
387 else:
388 for rec in topological.sort(

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/unitofwork.py in execute_aggregate(self, uow, recs)
666 [self.state] +
667 [r.state for r in our_recs],
--> 668 uow)
669
670 def __repr__(self):

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/persistence.py in save_obj(base_mapper, states, uowtransaction, single)
179 _emit_insert_statements(base_mapper, uowtransaction,
180 cached_connections,
--> 181 mapper, table, insert)
182
183 _finalize_insert_update_commands(

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/orm/persistence.py in _emit_insert_statements(base_mapper, uowtransaction, cached_connections, mapper, table, insert, bookkeeping)
828
829 c = cached_connections[connection].\
--> 830 execute(statement, multiparams)
831
832 if bookkeeping:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in execute(self, object, *multiparams, **params)
946 raise exc.ObjectNotExecutableError(object)
947 else:
--> 948 return meth(self, multiparams, params)
949
950 def _execute_function(self, func, multiparams, params):

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/sql/elements.py in _execute_on_connection(self, connection, multiparams, params)
267 def _execute_on_connection(self, connection, multiparams, params):
268 if self.supports_execution:
--> 269 return connection._execute_clauseelement(self, multiparams, params)
270 else:
271 raise exc.ObjectNotExecutableError(self)

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_clauseelement(self, elem, multiparams, params)
1058 compiled_sql,
1059 distilled_params,
-> 1060 compiled_sql, distilled_params
1061 )
1062 if self._has_events or self.engine._has_events:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1198 parameters,
1199 cursor,
-> 1200 context)
1201
1202 if self._has_events or self.engine._has_events:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _handle_dbapi_exception(self, e, statement, parameters, cursor, context)
1414 )
1415 else:
-> 1416 util.reraise(*exc_info)
1417
1418 finally:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/util/compat.py in reraise(tp, value, tb, cause)
247 if value.__traceback__ is not tb:
248 raise value.with_traceback(tb)
--> 249 raise value
250
251 else:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args)
1168 statement,
1169 parameters,
-> 1170 context)
1171 elif not parameters and context.no_parameters:
1172 if self.dialect._has_events:

~/anaconda3/envs/fonduer/lib/python3.6/site-packages/sqlalchemy/dialects/postgresql/psycopg2.py in do_executemany(self, cursor, statement, parameters, context)
681 extras.execute_batch(cursor, statement, parameters)
682 else:
--> 683 cursor.executemany(statement, parameters)
684
685 @util.memoized_instancemethod

UnicodeEncodeError: 'ascii' codec can't encode character '\uf0b7' in position 6282: ordinal not in range(128)

在打印executemany函数的输入时,我发现存在无效的unicode字符,但我不知道如何继续。

请注意:

  1. 我已使用教程中的 download_data.sh 脚本下载了 pdf 和 html 文件
  2. 我已经安装了安装文档中提到的所有先决条件

    • 适用于 Windows 的 Ubuntu 16.04 bash
    • PostgreSQL 版本:[9.5.13]
    • Poppler Utils 版本:[0.41.0-0ubuntu1.7]
    • Fonduer 版本:[0.2.3]
  3. 教程可以在此处找到。

  4. 我使用 Ubuntu for Windows 来运行所需的服务

最佳答案

该问题是由 PostgreSQL 数据库的编码造成的。Fonduer 需要 UTF-8 编码,而 Windows 默认情况下使用不同的编码。

我需要做的就是:

1. 删除现有的数据库(即教程使用的 stg_temp_max)。

dropdb stg_temp_max

2.新建一个UTF-8编码的数据库。

 createdb -E UTF8 -T template0 stg_temp_max

关于html - 解析 html 文件时 Fonduer max_storage_temp_tutorial 错误,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/51901371/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com