gpt4 book ai didi

json - 使用 SAS BASE 解析来自 JSON 的变量中带有 unicode 字符的表

转载 作者:行者123 更新时间:2023-12-03 22:42:56 27 4
gpt4 key购买 nike

我在用 vars 中的 unicode char 解析 JSON 时遇到了问题。
所以,我有下一个 JSON(示例):

 {  
"SASJSONExport":"1.0",
"SASTableData+TEST":[
{
"\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
"\u0421\u0440\u0435\u0434\u043d\u0435\u0435":4,
"\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0031"
},
{
"\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
"\u0421\u0440\u0435\u0434\u043d\u0435\u0435":2,
"\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0032"
},
{
"\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":1,
"\u0421\u0440\u0435\u0434\u043d\u0435\u0435":42,
"\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0033"
}
]
}

要从 JSON 解析表,我使用 SAS 引擎:
libname jsonfl JSON fileref=injson ;

代码更高解码单元格中的字符,但变量的名称看起来像缺少 val:
+--------------+---------------------------+------------+---------+---------+
| ordinal_root | ordinal_SASTableData_TEST | __________ | _______ | ______ |
+--------------+---------------------------+------------+---------+---------+
| 1 | 1 | 2 | 4 | Что-то1 |
| 1 | 2 | 2 | 2 | Что-то2 |
| 1 | 3 | 1 | 42 | Что-то3 |
+--------------+---------------------------+------------+---------+---------+

标题必须如下所示:
+--------------+---------------------------+------------+---------+---------+
| ordinal_root | ordinal_SASTableData_TEST | Переменная | Среднее | Строка |
+--------------+---------------------------+------------+---------+---------+

所以我决定用这样的名字替换 unicoded 变量字符 DIM_N_ .
为此,我必须找到所有与下一个正则表达式一致的字符串: /([\s\w\d\\]+)\"\:/
但是,要从 json 中获取字符串,我需要将下一个字符设置为 delim '{','}','[',']',',' .
但是如果把那个字符设置为 dlm ,我就不会再组装 json 了。
所以我决定在字符之前粘贴 ~将其设置为 dlm。
data delim;
infile injson lrecl=1073741823 nopad;
file delim;
input char1 $char1. @@;
if char1 in ('{','}','[',']',',') then
put '7E'x;
put char1 $CHAR1. @@;
run;

我得到了无效的 json 文件:
~
{"SASJSONExport":"1.0"~
,"SASTableData+TEST":~
[ ~
{"\u0056\u0061\u0072":2~
,"\u006d\u0065\u0061\u006e":4~
,"\u004e\u0061\u006d\u0065":"\u0073\u006d\u0074\u0068\u0031"~
}~
, ~
{"\u0056\u0061\u0072":2~
,"\u006d\u0065\u0061\u006e":2~
,"\u004e\u0061\u006d\u0065":"\u0073\u006d\u0074\u0068\u0032"~
}~
, ~
{"\u0056\u0061\u0072":1~
,"\u006d\u0065\u0061\u006e":42~
,"\u004e\u0061\u006d\u0065":"\u0073\u006d\u0074\u0068\u0033"~
} ~
]~
}

因此,下一步我将解析 JSON 并使用 ~作为分隔符:
data transfer;
length column $2000;
retain r;
infile delim delimiter='7E'x nopad;
input char1 : $4000. @@;
r = prxparse('/([\s\w\d\\]+)\"\:/');
pos = prxmatch(r,char1);
column = prxposn(r,1,char1);
n= _n_;
run;

它有效......但我觉得那些是 太糟糕的做法 , 并且它有限制。

UPD1
选项,
options vAlidfmtname=long VALIDMEMNAME=extend VALIDVARNAME=any;

返回:
+--------------+---------------------------+----------------------------+---------+--------------+
| ordinal_root | ordinal_SASTableData_TEST | __________ | _______ | ______ |
+--------------+---------------------------+----------------------------+---------+--------------+
| 1 | 1 | авфа2 фвафв = фвыа - тфвыа | 4 | Что-то1 ,,,, |
| 1 | 2 | авфа2 фвафв = фвыа - тфвыа | 2 | Что-то2 |
| 1 | 3 | авфа2 фвафв = фвыа - тфвыа | 2017 | Что-то3 |
+--------------+---------------------------+----------------------------+---------+--------------+

所以我的问题是:
  • 我可以在没有 infile 的情况下解码整个文件吗?声明?
  • 我可以用infile delimiter ,但设置 smth 选项不删除分隔符?

  • 欢迎适当批评。

    最佳答案

    UPD 我找到了解决方案,而无需手动编辑 json 映射文件,而是使用了正则表达式。

    libname _all_ clear;
    filename _all_ clear;
    filename _PDFOUT temp;
    filename _GSFNAME temp;
    proc datasets lib=work kill memtype=data nolist; quit;
    filename jsf '~/sasuser.v94/.json' encoding='utf-8';
    data _null_;
    file jsf;
    length js varchar(*);
    retain js;
    input;
    js=unicode(_infile_);
    put js;
    datalines;
    {
    "SASJSONExport":"1.0",
    "SASTableData+TEST":[
    {
    "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
    "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":4,
    "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0031"
    },
    {
    "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
    "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":2,
    "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0032"
    },
    {
    "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":1,
    "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":42,
    "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0033"
    }
    ]
    }
    ;
    run;
    filename jsm '~/sasuser.v94/.json.map' encoding='utf-8';
    libname jsd json fileref=jsf map=jsm automap=replace;
    libname jsm json fileref=jsm;
    data jsmm;
    merge jsm.datasets jsm.datasets_variables;
    by ordinal_DATASETS;
    run;
    proc sort data=jsmm; by ordinal_root ordinal_DATASETS; run;
    data _null_;
    set work.jsmm end=last;
    if _N_=1 then do;
    length s varchar(*) ds varchar(*);
    retain s ds prx;
    s='{"DATASETS":[';
    ds='';
    prx=prxparse('/[^_]/');
    end;
    if ds=dsname then s=s||',';
    else do;
    ds=dsname;
    if _N_^=1 then s=s||']},';
    s=cats(s,'{"DSNAME":"',ds,'","TABLEPATH":"',tablepath,'","VARIABLES":[');
    end;
    s=cats(s,'{"NAME":"',name,'","TYPE":"',type,'","PATH":"',path,'"');
    if prxmatch(prx,name) > length(name) then
    s=cats(s,',"LABEL":"',scan(path,-1,'/'),'"');
    s=s||'}';
    if last then do;
    s=s||']}]}';
    file jsm;
    put s;
    end;
    run;
    libname jsd json fileref=jsf map=jsm;
    proc print data=jsd.SASTableData_TEST label noobs; run;

    解决方案的第一个变体 这是快速的解决方案。首先准备输入数据:
    libname _all_ clear;
    filename _all_ clear;
    filename jsf '~/sasuser.v94/.json' encoding='utf-8';
    data _null_;
    file jsf;
    length js varchar(*);
    input;
    js=unicode(_infile_);
    put js;
    datalines;
    {
    "SASJSONExport":"1.0",
    "SASTableData+TEST": [
    {
    "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
    "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":4,
    "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0031"
    },
    {
    "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":2,
    "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":2,
    "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0032"
    },
    {
    "\u041f\u0435\u0440\u0435\u043c\u0435\u043d\u043d\u0430\u044f":1,
    "\u0421\u0440\u0435\u0434\u043d\u0435\u0435":42,
    "\u0421\u0442\u0440\u043e\u043a\u0430":"\u0427\u0442\u043e\u002d\u0442\u043e\u0033"
    }
    ]
    }
    ;
    run;

    输出文件 .json :
    {
    "SASJSONExport":"1.0",
    "SASTableData+TEST": [
    {
    "Переменная":2,
    "Среднее":4,
    "Строка":"Что-то1"
    },
    {
    "Переменная":2,
    "Среднее":2,
    "Строка":"Что-то2"
    },
    {
    "Переменная":1,
    "Среднее":42,
    "Строка":"Что-то3"
    }
    ]
    }

    然后创建json映射文件 .json.map :
    filename jsmf '~/sasuser.v94/.json.map' encoding='utf-8';
    libname jsm json fileref=jsf map=jsmf automap=create;
    .json.map内容:
    {
    "DATASETS": [
    {
    "DSNAME": "root",
    "TABLEPATH": "/root",
    "VARIABLES": [
    {
    "NAME": "ordinal_root",
    "TYPE": "ORDINAL",
    "PATH": "/root"
    },
    {
    "NAME": "SASJSONExport",
    "TYPE": "CHARACTER",
    "PATH": "/root/SASJSONExport",
    "CURRENT_LENGTH": 3
    }
    ]
    },
    {
    "DSNAME": "SASTableData_TEST",
    "TABLEPATH": "/root/SASTableData+TEST",
    "VARIABLES": [
    {
    "NAME": "ordinal_root",
    "TYPE": "ORDINAL",
    "PATH": "/root"
    },
    {
    "NAME": "ordinal_SASTableData_TEST",
    "TYPE": "ORDINAL",
    "PATH": "/root/SASTableData+TEST"
    },
    {
    "NAME": "____________________",
    "TYPE": "NUMERIC",
    "PATH": "/root/SASTableData+TEST/Переменная"
    },
    {
    "NAME": "______________",
    "TYPE": "NUMERIC",
    "PATH": "/root/SASTableData+TEST/Среднее"
    },
    {
    "NAME": "____________",
    "TYPE": "CHARACTER",
    "PATH": "/root/SASTableData+TEST/Строка",
    "CURRENT_LENGTH": 12
    }
    ]
    }
    ]
    }

    让我们通过删除不必要的数据集的描述并添加标签来稍微更改文件:
    {
    "DATASETS": [
    {
    "DSNAME": "SASTableData_TEST",
    "TABLEPATH": "/root/SASTableData+TEST",
    "VARIABLES": [
    {
    "NAME": "ordinal_root",
    "TYPE": "ORDINAL",
    "PATH": "/root"
    },
    {
    "NAME": "ordinal_SASTableData_TEST",
    "TYPE": "ORDINAL",
    "PATH": "/root/SASTableData+TEST"
    },
    {
    "NAME": "____________________",
    "TYPE": "NUMERIC",
    "PATH": "/root/SASTableData+TEST/Переменная",
    "LABEL": "Переменная"
    },
    {
    "NAME": "______________",
    "TYPE": "NUMERIC",
    "PATH": "/root/SASTableData+TEST/Среднее",
    "LABEL": "Среднее"
    },
    {
    "NAME": "____________",
    "TYPE": "CHARACTER",
    "PATH": "/root/SASTableData+TEST/Строка",
    "LABEL": "Строка",
    "CURRENT_LENGTH": 12
    }
    ]
    }
    ]
    }

    再试一次:
    libname jsd json fileref=jsf map=jsmf;
    proc print data=jsd.SASTableData_TEST label noobs; run;

    结果:
    +--------------+---------------------------+- ----------+---------+-----------+
    | ordinal_root | ordinal_SASTableData_TEST | Переменная | Среднее | Строка |
    +--------------+---------------------------+------------+---------+-----------+
    | 1 | 1 | 2 | 4 | Что-то1 |
    | 1 | 2 | 2 | 2 | Что-то2 |
    | 1 | 3 | 1 | 42 | Что-то3 |
    +--------------+---------------------------+------------+---------+-----------+

    所有这一切都是在 SAS 大学版中完成的。

    关于json - 使用 SAS BASE 解析来自 JSON 的变量中带有 unicode 字符的表,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/44175816/

    27 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com