diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/docling-ibm-models_slanet_1m.iml b/.idea/docling-ibm-models_slanet_1m.iml new file mode 100644 index 0000000..266b601 --- /dev/null +++ b/.idea/docling-ibm-models_slanet_1m.iml @@ -0,0 +1,15 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..812ab5a --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..e796249 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/docling_ibm_models/slanet_1m/.gitignore b/docling_ibm_models/slanet_1m/.gitignore new file mode 100644 index 0000000..85db5a4 --- /dev/null +++ b/docling_ibm_models/slanet_1m/.gitignore @@ -0,0 +1,34 @@ +## Python + +# Environments +.venv +venv + +# Byte-compiled / optimized / DLL files +__pycache__/ + +# Pytest cache +.pytest_cache + +# Pytest Coverage +.coverage + +## IntelliJ's IDEs + +.idea + +## Visual Studio Code + +.vscode + +## macOS + +.DS_Store + + +inference/ +inference_results/ +output/ +data/ +/data +evaluation/ diff --git 
a/docling_ibm_models/slanet_1m/12_tables/12_table_1.jpg b/docling_ibm_models/slanet_1m/12_tables/12_table_1.jpg new file mode 100644 index 0000000..abdbdcc Binary files /dev/null and b/docling_ibm_models/slanet_1m/12_tables/12_table_1.jpg differ diff --git a/docling_ibm_models/slanet_1m/12_tables/12_table_2.jpg b/docling_ibm_models/slanet_1m/12_tables/12_table_2.jpg new file mode 100644 index 0000000..1d5ffd6 Binary files /dev/null and b/docling_ibm_models/slanet_1m/12_tables/12_table_2.jpg differ diff --git a/docling_ibm_models/slanet_1m/Fonts/ARIALN.TTF b/docling_ibm_models/slanet_1m/Fonts/ARIALN.TTF new file mode 100644 index 0000000..94907a3 Binary files /dev/null and b/docling_ibm_models/slanet_1m/Fonts/ARIALN.TTF differ diff --git a/docling_ibm_models/slanet_1m/Fonts/ARIALNB.TTF b/docling_ibm_models/slanet_1m/Fonts/ARIALNB.TTF new file mode 100644 index 0000000..62437f0 Binary files /dev/null and b/docling_ibm_models/slanet_1m/Fonts/ARIALNB.TTF differ diff --git a/docling_ibm_models/slanet_1m/Fonts/ARIALNBI.TTF b/docling_ibm_models/slanet_1m/Fonts/ARIALNBI.TTF new file mode 100644 index 0000000..d3f019a Binary files /dev/null and b/docling_ibm_models/slanet_1m/Fonts/ARIALNBI.TTF differ diff --git a/docling_ibm_models/slanet_1m/Fonts/ARIALNI.TTF b/docling_ibm_models/slanet_1m/Fonts/ARIALNI.TTF new file mode 100644 index 0000000..4acd468 Binary files /dev/null and b/docling_ibm_models/slanet_1m/Fonts/ARIALNI.TTF differ diff --git a/docling_ibm_models/slanet_1m/Fonts/arial.ttf b/docling_ibm_models/slanet_1m/Fonts/arial.ttf new file mode 100644 index 0000000..27372d9 Binary files /dev/null and b/docling_ibm_models/slanet_1m/Fonts/arial.ttf differ diff --git a/docling_ibm_models/slanet_1m/Fonts/arialbd.ttf b/docling_ibm_models/slanet_1m/Fonts/arialbd.ttf new file mode 100644 index 0000000..03bb5e2 Binary files /dev/null and b/docling_ibm_models/slanet_1m/Fonts/arialbd.ttf differ diff --git a/docling_ibm_models/slanet_1m/Fonts/arialbi.ttf 
b/docling_ibm_models/slanet_1m/Fonts/arialbi.ttf new file mode 100644 index 0000000..dc80b4e Binary files /dev/null and b/docling_ibm_models/slanet_1m/Fonts/arialbi.ttf differ diff --git a/docling_ibm_models/slanet_1m/Fonts/ariali.ttf b/docling_ibm_models/slanet_1m/Fonts/ariali.ttf new file mode 100644 index 0000000..652df71 Binary files /dev/null and b/docling_ibm_models/slanet_1m/Fonts/ariali.ttf differ diff --git a/docling_ibm_models/slanet_1m/Fonts/ariblk.ttf b/docling_ibm_models/slanet_1m/Fonts/ariblk.ttf new file mode 100644 index 0000000..e7ae345 Binary files /dev/null and b/docling_ibm_models/slanet_1m/Fonts/ariblk.ttf differ diff --git a/docling_ibm_models/slanet_1m/README.md b/docling_ibm_models/slanet_1m/README.md new file mode 100644 index 0000000..d1db72b --- /dev/null +++ b/docling_ibm_models/slanet_1m/README.md @@ -0,0 +1,19 @@ +# SLANet_1M + +- Install PaddlePaddle with CUDA 12.3 + + ```bash linenums="1" + python -m pip install paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/stable/cu123/ + ``` + +- Then install the remaining dependencies: + ```bash linenums="1" + pip install -r requirements.txt + ``` + +- To train: + ```bash linenums="1" + python train.py -c configs/SLANet_1M.yml -o Global.use_amp=True Global.scale_loss=1024.0 Global.use_dynamic_loss_scaling=True + ``` + +A model pre-trained on PubTabNet + SynthTabNet can be found [here](https://drive.google.com/drive/folders/1aIzP3a3Ci0n9hXD2j57Dq4uCfQlt8yoW?usp=drive_link) \ No newline at end of file diff --git a/docling_ibm_models/slanet_1m/__init__.py b/docling_ibm_models/slanet_1m/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/docling_ibm_models/slanet_1m/configs/SLANet_1M.yml b/docling_ibm_models/slanet_1m/configs/SLANet_1M.yml new file mode 100644 index 0000000..946daf8 --- /dev/null +++ b/docling_ibm_models/slanet_1m/configs/SLANet_1M.yml @@ -0,0 +1,145 @@ +Global: + use_gpu: true + epoch_num: 50 + log_smooth_window: 20 + print_batch_step: 20 + save_model_dir: ./output/SLANet_1M + 
save_epoch_step: 400 + # evaluation is run every 2000 iterations after the 0th iteration + eval_batch_step: [0, 2000] + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: ./output/SLANet_1M/infer + use_visualdl: False + infer_img: + # for data or label process + character_dict_path: dict/table_structure_dict.txt + character_type: en + max_text_length: &max_text_length 500 + box_format: &box_format 'xyxy' # 'xywh', 'xyxy', 'xyxyxyxy' + infer_mode: False + use_sync_bn: True + save_res_path: 'output/infer' + d2s_train_image_shape: [3, -1, -1] + amp_custom_white_list: ['concat', 'elementwise_sub', 'set_value'] + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + clip_norm: 5.0 + lr: + name: Piecewise + learning_rate: 0.001 + decay_epochs : [29, 39] + values : [0.001, 0.0001, 0.00005] + regularizer: + name: 'L2' + factor: 0.00000 + +Architecture: + model_type: table + algorithm: SLANet + Backbone: + name: PPLCNet + scale: 1.0 + pretrained: true + use_ssld: true + Neck: + name: CSPPAN + out_channels: 96 + Head: + name: SLAHead + hidden_size: 256 + max_text_length: *max_text_length + loc_reg_num: &loc_reg_num 4 + +Loss: + name: SLALoss + structure_weight: 1.0 + loc_weight: 2.0 + loc_loss: smooth_l1 + +PostProcess: + name: TableLabelDecode + merge_no_span_structure: &merge_no_span_structure True + +Metric: + name: TableMetric + main_indicator: acc + compute_bbox_metric: False + loc_reg_num: *loc_reg_num + box_format: *box_format + +Train: + dataset: + name: PubTabDataSet + data_dir: data/final_merged/train/ + label_file_list: [data/final_merged/train_annotations.jsonl] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - TableLabelEncode: + learn_empty_box: False + merge_no_span_structure: *merge_no_span_structure + replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length + - TableBoxEncode: + in_box_format: *box_format + out_box_format: *box_format + - 
ResizeTableImage: + max_len: 488 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + size: [488, 488] + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'structure', 'bboxes', 'bbox_masks', 'length', 'shape'] + loader: + shuffle: True + batch_size_per_card: 72 + drop_last: True + num_workers: 1 + +Eval: + dataset: + name: PubTabDataSet + data_dir: data/final_merged/val/ + label_file_list: [data/final_merged/val_annotations.jsonl] + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - TableLabelEncode: + learn_empty_box: False + merge_no_span_structure: *merge_no_span_structure + replace_empty_cell_token: False + loc_reg_num: *loc_reg_num + max_text_length: *max_text_length + - TableBoxEncode: + in_box_format: *box_format + out_box_format: *box_format + - ResizeTableImage: + max_len: 488 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + - PaddingTableImage: + size: [488, 488] + - ToCHWImage: + - KeepKeys: + keep_keys: ['image', 'structure', 'bboxes', 'bbox_masks', 'length', 'shape'] + loader: + shuffle: False + drop_last: False + batch_size_per_card: 72 + num_workers: 1 diff --git a/docling_ibm_models/slanet_1m/dict/table_structure_dict.txt b/docling_ibm_models/slanet_1m/dict/table_structure_dict.txt new file mode 100644 index 0000000..fec6f7d --- /dev/null +++ b/docling_ibm_models/slanet_1m/dict/table_structure_dict.txt @@ -0,0 +1,28 @@ + + + + + + + + + + colspan="2" + colspan="3" + rowspan="2" + colspan="4" + colspan="6" + rowspan="3" + colspan="9" + colspan="10" + colspan="7" + rowspan="4" + rowspan="5" + rowspan="9" + colspan="8" + rowspan="8" + rowspan="6" + rowspan="7" + rowspan="10" diff --git a/docling_ibm_models/slanet_1m/dict_table/en_dict.txt b/docling_ibm_models/slanet_1m/dict_table/en_dict.txt new file mode 100644 index 0000000..7677d31 --- /dev/null +++ 
b/docling_ibm_models/slanet_1m/dict_table/en_dict.txt @@ -0,0 +1,95 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ + diff --git a/docling_ibm_models/slanet_1m/dict_table/ppocr_keys_v1.txt b/docling_ibm_models/slanet_1m/dict_table/ppocr_keys_v1.txt new file mode 100644 index 0000000..b75af21 --- /dev/null +++ b/docling_ibm_models/slanet_1m/dict_table/ppocr_keys_v1.txt @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! 
+姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. +鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 
+哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 +表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 +扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 
+几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 +邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 +昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 
+於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? +郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 
+裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 +磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 
+隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 +炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 +昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 
+掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 +桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 +胄 +疃 
+薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 +怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 
+疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 +視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 +閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 
+鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. +f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 diff --git a/docling_ibm_models/slanet_1m/dict_table/table_dict.txt b/docling_ibm_models/slanet_1m/dict_table/table_dict.txt new file mode 100644 index 0000000..2ef028c --- /dev/null +++ b/docling_ibm_models/slanet_1m/dict_table/table_dict.txt @@ -0,0 +1,277 @@ +← + +☆ +─ +α + + +⋅ +$ +ω +ψ +χ +( +υ +≥ +σ +, +ρ +ε +0 +■ +4 +8 +✗ +b +< +✓ +Ψ +Ω +€ +D +3 +Π +H +║ + +L +Φ +Χ +θ +P +κ +λ +μ +T +ξ +X +β +γ +δ +\ +ζ +η +` +d + +h +f +l +Θ +p +√ +t + +x +Β +Γ +Δ +| +ǂ +ɛ +j +̧ +➢ +⁡ +̌ +′ +« +△ +▲ +# + +' +Ι ++ +¶ +/ +▼ +⇑ +□ +· +7 +▪ +; +? 
+➔ +∩ +C +÷ +G +⇒ +K + +O +S +С +W +Α +[ +○ +_ +● +‡ +c +z +g + +o + +〈 +〉 +s +⩽ +w +φ +ʹ +{ +» +∣ +̆ +e +ˆ +∈ +τ +◆ +ι +∅ +∆ +∙ +∘ +Ø +ß +✔ +∞ +∑ +− +× +◊ +∗ +∖ +˃ +˂ +∫ +" +i +& +π +↔ +* +∥ +æ +∧ +. +⁄ +ø +Q +∼ +6 +⁎ +: +★ +> +a +B +≈ +F +J +̄ +N +♯ +R +V + +― +Z +♣ +^ +¤ +¥ +§ + +¢ +£ +≦ +­ +≤ +‖ +Λ +© +n +↓ +→ +↑ +r +° +± +v + +♂ +k +♀ +~ +ᅟ +̇ +@ +” +♦ +ł +® +⊕ +„ +! + +% +⇓ +) +- +1 +5 +9 += +А +A +‰ +⋆ +Σ +E +◦ +I +※ +M +m +̨ +⩾ +† + +• +U +Y +
 +] +̸ +2 +‐ +– +‒ +̂ +— +̀ +́ +’ +‘ +⋮ +⋯ +̊ +“ +̈ +≧ +q +u +ı +y + +​ +̃ +} +ν diff --git a/docling_ibm_models/slanet_1m/dict_table/table_structure_dict.txt b/docling_ibm_models/slanet_1m/dict_table/table_structure_dict.txt new file mode 100644 index 0000000..fec6f7d --- /dev/null +++ b/docling_ibm_models/slanet_1m/dict_table/table_structure_dict.txt @@ -0,0 +1,28 @@ + + + + + + + + + + colspan="2" + colspan="3" + rowspan="2" + colspan="4" + colspan="6" + rowspan="3" + colspan="9" + colspan="10" + colspan="7" + rowspan="4" + rowspan="5" + rowspan="9" + colspan="8" + rowspan="8" + rowspan="6" + rowspan="7" + rowspan="10" diff --git a/docling_ibm_models/slanet_1m/export_model.py b/docling_ibm_models/slanet_1m/export_model.py new file mode 100644 index 0000000..7e88ee6 --- /dev/null +++ b/docling_ibm_models/slanet_1m/export_model.py @@ -0,0 +1,295 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys + +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.insert(0, os.path.abspath(os.path.join(__dir__, ".."))) + +import argparse + +import paddle +from paddle.jit import to_static + +from modeling.architectures import build_model +from paddleocr.ppocr.postprocess import build_post_process +from paddleocr.ppocr.utils.save_load import load_model +from paddleocr.ppocr.utils.logging import get_logger +from program import load_config, merge_config, ArgsParser + + +def export_single_model( + model, arch_config, save_path, logger, input_shape=None, quanter=None +): + if arch_config["algorithm"] == "SRN": + max_text_length = arch_config["Head"]["max_text_length"] + other_shape = [ + paddle.static.InputSpec(shape=[None, 1, 64, 256], dtype="float32"), + [ + paddle.static.InputSpec(shape=[None, 256, 1], dtype="int64"), + paddle.static.InputSpec( + shape=[None, max_text_length, 1], dtype="int64" + ), + paddle.static.InputSpec( + shape=[None, 8, max_text_length, max_text_length], dtype="int64" + ), + paddle.static.InputSpec( + shape=[None, 8, max_text_length, max_text_length], dtype="int64" + ), + ], + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "SAR": + other_shape = [ + paddle.static.InputSpec(shape=[None, 3, 48, 160], dtype="float32"), + [paddle.static.InputSpec(shape=[None], dtype="float32")], + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] in ["SVTR_LCNet", "SVTR_HGNet"]: + other_shape = [ + paddle.static.InputSpec(shape=[None, 3, 48, -1], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] in ["SVTR", "CPPD"]: + other_shape = [ + paddle.static.InputSpec(shape=[None] + input_shape, dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "PREN": + other_shape = [ + paddle.static.InputSpec(shape=[None, 3, 64, 256], 
dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["model_type"] == "sr": + other_shape = [ + paddle.static.InputSpec(shape=[None, 3, 16, 64], dtype="float32") + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "ViTSTR": + other_shape = [ + paddle.static.InputSpec(shape=[None, 1, 224, 224], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "ABINet": + if not input_shape: + input_shape = [3, 32, 128] + other_shape = [ + paddle.static.InputSpec(shape=[None] + input_shape, dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] in ["NRTR", "SPIN", "RFL"]: + other_shape = [ + paddle.static.InputSpec(shape=[None, 1, 32, 100], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] in ["SATRN"]: + other_shape = [ + paddle.static.InputSpec(shape=[None, 3, 32, 100], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "VisionLAN": + other_shape = [ + paddle.static.InputSpec(shape=[None, 3, 64, 256], dtype="float32"), + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "RobustScanner": + max_text_length = arch_config["Head"]["max_text_length"] + other_shape = [ + paddle.static.InputSpec(shape=[None, 3, 48, 160], dtype="float32"), + [ + paddle.static.InputSpec( + shape=[ + None, + ], + dtype="float32", + ), + paddle.static.InputSpec(shape=[None, max_text_length], dtype="int64"), + ], + ] + model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] == "CAN": + other_shape = [ + [ + paddle.static.InputSpec(shape=[None, 1, None, None], dtype="float32"), + paddle.static.InputSpec(shape=[None, 1, None, None], dtype="float32"), + paddle.static.InputSpec( + shape=[None, arch_config["Head"]["max_text_length"]], dtype="int64" + ), + ] + ] 
+ model = to_static(model, input_spec=other_shape) + elif arch_config["algorithm"] in ["LayoutLM", "LayoutLMv2", "LayoutXLM"]: + input_spec = [ + paddle.static.InputSpec(shape=[None, 512], dtype="int64"), # input_ids + paddle.static.InputSpec(shape=[None, 512, 4], dtype="int64"), # bbox + paddle.static.InputSpec(shape=[None, 512], dtype="int64"), # attention_mask + paddle.static.InputSpec(shape=[None, 512], dtype="int64"), # token_type_ids + paddle.static.InputSpec(shape=[None, 3, 224, 224], dtype="int64"), # image + ] + if "Re" in arch_config["Backbone"]["name"]: + input_spec.extend( + [ + paddle.static.InputSpec( + shape=[None, 512, 3], dtype="int64" + ), # entities + paddle.static.InputSpec( + shape=[None, None, 2], dtype="int64" + ), # relations + ] + ) + if model.backbone.use_visual_backbone is False: + input_spec.pop(4) + model = to_static(model, input_spec=[input_spec]) + else: + infer_shape = [3, -1, -1] + if arch_config["model_type"] == "rec": + infer_shape = [3, 32, -1] # for rec model, H must be 32 + if ( + "Transform" in arch_config + and arch_config["Transform"] is not None + and arch_config["Transform"]["name"] == "TPS" + ): + logger.info( + "When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training" + ) + infer_shape[-1] = 100 + elif arch_config["model_type"] == "table": + infer_shape = [3, 488, 488] + if arch_config["algorithm"] == "TableMaster": + infer_shape = [3, 480, 480] + if arch_config["algorithm"] == "SLANet": + infer_shape = [3, -1, -1] + model = to_static( + model, + input_spec=[ + paddle.static.InputSpec(shape=[None] + infer_shape, dtype="float32") + ], + ) + + if ( + arch_config["model_type"] != "sr" + and arch_config["Backbone"]["name"] == "PPLCNetV3" + ): + # for rep lcnetv3 + for layer in model.sublayers(): + if hasattr(layer, "rep") and not getattr(layer, "is_repped"): + layer.rep() + + if quanter is None: + paddle.jit.save(model, save_path) + else: + 
def _ctc_char_num(char_num, post_process_name, sar_decoder, nrtr_decoder):
    """Return the CTC vocabulary size derived from the decoder's character count.

    The visible code subtracts 2 when the post-process is the SAR-style
    decoder and 3 when it is the NRTR-style decoder (presumably because those
    decoders add special tokens to ``character`` -- TODO confirm against the
    label decoders).
    """
    if post_process_name == sar_decoder:
        char_num = char_num - 2
    if post_process_name == nrtr_decoder:
        char_num = char_num - 3
    return char_num


def _multihead_out_channels(ctc_char_num):
    """Build the per-decoder output sizes expected by a ``MultiHead`` head."""
    return {
        "CTCLabelDecode": ctc_char_num,
        "SARLabelDecode": ctc_char_num + 2,
        "NRTRLabelDecode": ctc_char_num + 3,
    }


def main():
    """Load a training config and checkpoint, then export inference model(s).

    Steps: parse CLI flags, load and merge the config, build the post-process
    to learn the character count, patch the head output sizes into the
    architecture config, build and load the model, and finally hand the
    model (or every sub-model of a distillation model) to
    ``export_single_model``. Output goes below
    ``config["Global"]["save_inference_dir"]``.
    """
    FLAGS = ArgsParser().parse_args()
    config = load_config(FLAGS.config)
    config = merge_config(config, FLAGS.opt)
    logger = get_logger()

    # build post process
    post_process_class = build_post_process(config["PostProcess"], config["Global"])

    # build model
    # for rec algorithm
    if hasattr(post_process_class, "character"):
        char_num = len(post_process_class.character)
        if config["Architecture"]["algorithm"] in [
            "Distillation",
        ]:  # distillation model
            for model_config in config["Architecture"]["Models"].values():
                head_config = model_config["Head"]
                if head_config["name"] == "MultiHead":  # multi head
                    # Derive the CTC size from the untouched base count for
                    # every sub-model. The previous code decremented
                    # ``char_num`` in place, so each additional sub-model
                    # received a size that was 2 (or 3) smaller again.
                    ctc_char_num = _ctc_char_num(
                        char_num,
                        config["PostProcess"]["name"],
                        "DistillationSARLabelDecode",
                        "DistillationNRTRLabelDecode",
                    )
                    head_config["out_channels_list"] = _multihead_out_channels(
                        ctc_char_num
                    )
                else:
                    head_config["out_channels"] = char_num
                # just one final tensor needs to exported for inference
                model_config["return_all_feats"] = False
        elif config["Architecture"]["Head"]["name"] == "MultiHead":  # multi head
            ctc_char_num = _ctc_char_num(
                char_num,
                config["PostProcess"]["name"],
                "SARLabelDecode",
                "NRTRLabelDecode",
            )
            config["Architecture"]["Head"]["out_channels_list"] = (
                _multihead_out_channels(ctc_char_num)
            )
        else:  # base rec model
            config["Architecture"]["Head"]["out_channels"] = char_num

    # for sr algorithm
    if config["Architecture"]["model_type"] == "sr":
        config["Architecture"]["Transform"]["infer_mode"] = True
    model = build_model(config["Architecture"])
    load_model(config, model, model_type=config["Architecture"]["model_type"])
    model.eval()

    save_path = config["Global"]["save_inference_dir"]

    arch_config = config["Architecture"]

    # Some algorithms need the static input shape used by the eval transforms.
    if (
        arch_config["algorithm"] in ["SVTR", "CPPD"]
        and arch_config["Head"]["name"] != "MultiHead"
    ):
        input_shape = config["Eval"]["dataset"]["transforms"][-2]["SVTRRecResizeImg"][
            "image_shape"
        ]
    elif arch_config["algorithm"].lower() == "ABINet".lower():
        rec_rs = [
            c
            for c in config["Eval"]["dataset"]["transforms"]
            if "ABINetRecResizeImg" in c
        ]
        input_shape = rec_rs[0]["ABINetRecResizeImg"]["image_shape"] if rec_rs else None
    else:
        input_shape = None

    if arch_config["algorithm"] in [
        "Distillation",
    ]:  # distillation model
        # Each sub-model is exported into its own "<save_dir>/<name>/inference".
        archs = list(arch_config["Models"].values())
        for idx, name in enumerate(model.model_name_list):
            sub_model_save_path = os.path.join(save_path, name, "inference")
            export_single_model(
                model.model_list[idx], archs[idx], sub_model_save_path, logger
            )
    else:
        save_path = os.path.join(save_path, "inference")
        export_single_model(
            model, arch_config, save_path, logger, input_shape=input_shape
        )
b/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_det_infer/inference.pdiparams.info new file mode 100644 index 0000000..622d87b Binary files /dev/null and b/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_det_infer/inference.pdiparams.info differ diff --git a/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_det_infer/inference.pdmodel b/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_det_infer/inference.pdmodel new file mode 100644 index 0000000..0a6bf1e Binary files /dev/null and b/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_det_infer/inference.pdmodel differ diff --git a/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_rec_infer/inference.pdiparams b/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_rec_infer/inference.pdiparams new file mode 100644 index 0000000..26ba0c9 Binary files /dev/null and b/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_rec_infer/inference.pdiparams differ diff --git a/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_rec_infer/inference.pdiparams.info b/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_rec_infer/inference.pdiparams.info new file mode 100644 index 0000000..1cdccfc Binary files /dev/null and b/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_rec_infer/inference.pdiparams.info differ diff --git a/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_rec_infer/inference.pdmodel b/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_rec_infer/inference.pdmodel new file mode 100644 index 0000000..5dfe4cf Binary files /dev/null and b/docling_ibm_models/slanet_1m/inference_table/en_PP-OCRv3_rec_infer/inference.pdmodel differ diff --git a/docling_ibm_models/slanet_1m/inference_table/en_ppstructure_mobile_v2.0_SLANet_infer/inference.pdiparams b/docling_ibm_models/slanet_1m/inference_table/en_ppstructure_mobile_v2.0_SLANet_infer/inference.pdiparams new file mode 100644 index 0000000..3a12bbe Binary files /dev/null and 
b/docling_ibm_models/slanet_1m/inference_table/en_ppstructure_mobile_v2.0_SLANet_infer/inference.pdiparams differ diff --git a/docling_ibm_models/slanet_1m/inference_table/en_ppstructure_mobile_v2.0_SLANet_infer/inference.pdiparams.info b/docling_ibm_models/slanet_1m/inference_table/en_ppstructure_mobile_v2.0_SLANet_infer/inference.pdiparams.info new file mode 100644 index 0000000..55c45ce Binary files /dev/null and b/docling_ibm_models/slanet_1m/inference_table/en_ppstructure_mobile_v2.0_SLANet_infer/inference.pdiparams.info differ diff --git a/docling_ibm_models/slanet_1m/inference_table/en_ppstructure_mobile_v2.0_SLANet_infer/inference.pdmodel b/docling_ibm_models/slanet_1m/inference_table/en_ppstructure_mobile_v2.0_SLANet_infer/inference.pdmodel new file mode 100644 index 0000000..c522762 Binary files /dev/null and b/docling_ibm_models/slanet_1m/inference_table/en_ppstructure_mobile_v2.0_SLANet_infer/inference.pdmodel differ diff --git a/docling_ibm_models/slanet_1m/kubernetes/train-job.yaml b/docling_ibm_models/slanet_1m/kubernetes/train-job.yaml new file mode 100644 index 0000000..b1af14c --- /dev/null +++ b/docling_ibm_models/slanet_1m/kubernetes/train-job.yaml @@ -0,0 +1,88 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: train-job-${CI_PIPELINE_ID} + namespace: $NAMESPACE +spec: + template: + spec: + containers: + - name: train-container + image: python:3.11 + command: [ "bash", "-c" ] + args: + - | + # Install MinIO client + curl https://dl.min.io/client/mc/release/linux-amd64/mc --create-dirs -o ~/minio-binaries/mc + chmod +x $HOME/minio-binaries/mc + export PATH=$PATH:$HOME/minio-binaries/ + + # Set alias for MinIO server + mc alias set minio $ENDPOINT_URL $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + + # Clone the repository + git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${CI_SERVER_HOST}/${CI_PROJECT_PATH}.git /repo + cd /repo + + # Checkout the branch + git checkout $CI_COMMIT_REF_NAME + echo "Checking out branch $CI_COMMIT_REF_NAME" + + # 
Set up environment + echo "BUCKET=$BUCKET" >> .env + echo "ENDPOINT_URL=$ENDPOINT_URL" >> .env + echo "REGION=$REGION" >> .env + echo "MODELS_BUCKET=$MODELS_BUCKET" >> .env + echo "AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID" >> .env + echo "AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY" >> .env + export $(cat .env | xargs) + + # Install dependencies + python3.11 -m venv .venv + source .venv/bin/activate + pip install --requirement requirements.txt --no-cache-dir + pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/ + apt update && apt install --yes ffmpeg libsm6 libxext6 + + # Run training + dvc repro --pull + + # Save the experiment metadata + dvc params diff main --md > experiment_report.md + dvc metrics diff main --md >> experiment_report.md + + # Push the experiment metadata to MinIO + dvc push + + mc cp experiment_report.md minio/$MODELS_BUCKET/experiments/experiment_report_${CI_MERGE_REQUEST_IID}.md + mc cp dvc.lock minio/$MODELS_BUCKET/experiments/dvc_lock_${CI_MERGE_REQUEST_IID}.lock + volumeMounts: + - name: repo-volume + mountPath: /repo + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-credentials + key: access_key_id + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-credentials + key: secret_access_key + - name: BUCKET + value: $BUCKET + - name: ENDPOINT_URL + value: $ENDPOINT_URL + - name: REGION + value: $REGION + - name: MODELS_BUCKET + value: $MODELS_BUCKET + resources: + limits: + nvidia.com/gpu-rtx-4090-24gb: 1 + restartPolicy: Never + volumes: + - name: repo-volume + emptyDir: { } + backoffLimit: 2 diff --git a/docling_ibm_models/slanet_1m/losses/__init__.py b/docling_ibm_models/slanet_1m/losses/__init__.py new file mode 100644 index 0000000..0b170d6 --- /dev/null +++ b/docling_ibm_models/slanet_1m/losses/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
def build_loss(config):
    """Instantiate the loss named by ``config["name"]``.

    Args:
        config: mapping with a ``"name"`` key selecting the loss class; every
            other key is forwarded to that class as a keyword argument. The
            mapping is deep-copied, so the caller's object is not modified.

    Returns:
        An instance of the selected loss class.

    Raises:
        KeyError: if ``config`` has no ``"name"`` entry.
        AssertionError: if the name is not one of the supported losses.
    """
    support_dict = [
        "TableAttentionLoss",
        "SLALoss",
    ]
    config = copy.deepcopy(config)
    module_name = config.pop("name")
    # Raise explicitly instead of using ``assert``: assert statements are
    # stripped under ``python -O``, which would let an unvalidated name through.
    if module_name not in support_dict:
        raise AssertionError("loss only support {}".format(support_dict))
    # The name is whitelisted above; resolve it among this module's imported
    # classes rather than evaluating it as an expression.
    module_class = globals()[module_name](**config)
    return module_class
class SLALoss(nn.Layer):
    """Combined structure (cross-entropy) and location (smooth-L1) loss.

    ``structure_weight`` and ``loc_weight`` scale the two terms. ``loc_loss``
    is stored on the instance but is not consulted by ``forward``.
    """

    def __init__(self, structure_weight, loc_weight, loc_loss="mse", **kwargs):
        super().__init__()
        self.loss_func = nn.CrossEntropyLoss(weight=None, reduction="mean")
        self.structure_weight = structure_weight
        self.loc_weight = loc_weight
        self.loc_loss = loc_loss
        # Keeps the location-loss normalisation finite when the mask is all zero.
        self.eps = 1e-12

    def forward(self, predicts, batch):
        """Return a dict with ``loss``, ``structure_loss`` and ``loc_loss``.

        ``batch[1]`` holds structure targets, ``batch[2]`` location targets,
        ``batch[3]`` the location mask and ``batch[-2]`` the values whose
        maximum bounds the target window (presumably sequence lengths --
        TODO confirm against the data loader).
        """
        token_logits = predicts["structure_probs"]
        token_targets = batch[1].astype("int64")
        max_len = batch[-2].max()
        # Drop the first position and keep entries up to max_len + 1.
        window = slice(1, max_len + 2)
        token_targets = token_targets[:, window]

        structure_loss = self.loss_func(token_logits, token_targets)
        structure_loss = paddle.mean(structure_loss) * self.structure_weight

        box_preds = predicts["loc_preds"]
        box_targets = batch[2].astype("float32")
        box_mask = batch[3].astype("float32")
        box_targets = box_targets[:, window]
        box_mask = box_mask[:, window]

        # Masked sum of smooth-L1 errors, then normalised by the mask total.
        summed_error = F.smooth_l1_loss(
            box_preds * box_mask,
            box_targets * box_mask,
            reduction="sum",
        )
        loc_loss = summed_error * self.loc_weight
        loc_loss = loc_loss / (box_mask.sum() + self.eps)

        return {
            "loss": structure_loss + loc_loss,
            "structure_loss": structure_loss,
            "loc_loss": loc_loss,
        }
def build_metric(config):
    """Instantiate the metric named by ``config["name"]``.

    Args:
        config: mapping with a ``"name"`` key selecting the metric class; the
            remaining keys are forwarded as keyword arguments. The mapping is
            deep-copied, so the caller's object is not modified.

    Returns:
        An instance of the selected metric class.

    Raises:
        KeyError: if ``config`` has no ``"name"`` entry.
        AssertionError: if the name is not one of the supported metrics.
    """
    # Only metrics this module actually imports are listed. Advertising
    # names that are never imported made them pass validation and then fail
    # with a NameError at instantiation time.
    support_dict = [
        "DetMetric",
        "DetFCEMetric",
        "RecMetric",
        "TableMetric",
        "CNTMetric",
        "CANMetric",
    ]

    config = copy.deepcopy(config)
    module_name = config.pop("name")
    # Raise explicitly instead of using ``assert``: assert statements are
    # stripped under ``python -O``, which would let an unvalidated name through.
    if module_name not in support_dict:
        raise AssertionError("metric only support {}".format(support_dict))
    # The name is whitelisted above; resolve it among this module's imported
    # classes rather than evaluating it as an expression.
    module_class = globals()[module_name](**config)
    return module_class
class DetMetric(object):
    """Collect per-image detection results and reduce them on demand."""

    def __init__(self, main_indicator="hmean", **kwargs):
        self.evaluator = DetectionIoUEvaluator()
        self.main_indicator = main_indicator
        self.reset()

    def __call__(self, preds, batch, **kwargs):
        """
        Evaluate one batch and store one result per image.

        batch: a list produced by dataloaders.
            image: np.ndarray  of shape (N, C, H, W).
            ratio_list: np.ndarray  of shape(N,2)
            polygons: np.ndarray  of shape (N, K, 4, 2), the polygons of objective regions.
            ignore_tags: np.ndarray  of shape (N, K), indicates whether a region is ignorable or not.
        preds: a list of dict produced by post process
            points: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions.
        """
        polygons_per_image = batch[2]
        ignore_flags_per_image = batch[3]
        for prediction, polygons, ignore_flags in zip(
            preds, polygons_per_image, ignore_flags_per_image
        ):
            # ground-truth entries, one per annotated polygon
            ground_truth = []
            for polygon, ignore_flag in zip(polygons, ignore_flags):
                ground_truth.append(
                    {"points": polygon, "text": "", "ignore": ignore_flag}
                )
            # detection entries, one per predicted polygon
            detections = []
            for polygon in prediction["points"]:
                detections.append({"points": polygon, "text": ""})
            self.results.append(
                self.evaluator.evaluate_image(ground_truth, detections)
            )

    def get_metric(self):
        """
        Combine the stored results, clear them, and return the combined value.

        return metrics {
                 'precision': 0,
                 'recall': 0,
                 'hmean': 0
            }
        """
        combined = self.evaluator.combine_results(self.results)
        self.reset()
        return combined

    def reset(self):
        # forget everything accumulated so far
        self.results = []
+ ratio_list: np.ndarray of shape(N,2) + polygons: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions. + ignore_tags: np.ndarray of shape (N, K), indicates whether a region is ignorable or not. + preds: a list of dict produced by post process + points: np.ndarray of shape (N, K, 4, 2), the polygons of objective regions. + """ + gt_polyons_batch = batch[2] + ignore_tags_batch = batch[3] + + for pred, gt_polyons, ignore_tags in zip( + preds, gt_polyons_batch, ignore_tags_batch + ): + # prepare gt + gt_info_list = [ + {"points": gt_polyon, "text": "", "ignore": ignore_tag} + for gt_polyon, ignore_tag in zip(gt_polyons, ignore_tags) + ] + # prepare det + det_info_list = [ + {"points": det_polyon, "text": "", "score": score} + for det_polyon, score in zip(pred["points"], pred["scores"]) + ] + + for score_thr in self.results.keys(): + det_info_list_thr = [ + det_info + for det_info in det_info_list + if det_info["score"] >= score_thr + ] + result = self.evaluator.evaluate_image(gt_info_list, det_info_list_thr) + self.results[score_thr].append(result) + + def get_metric(self): + """ + return metrics {'heman':0, + 'thr 0.3':'precision: 0 recall: 0 hmean: 0', + 'thr 0.4':'precision: 0 recall: 0 hmean: 0', + 'thr 0.5':'precision: 0 recall: 0 hmean: 0', + 'thr 0.6':'precision: 0 recall: 0 hmean: 0', + 'thr 0.7':'precision: 0 recall: 0 hmean: 0', + 'thr 0.8':'precision: 0 recall: 0 hmean: 0', + 'thr 0.9':'precision: 0 recall: 0 hmean: 0', + } + """ + metrics = {} + hmean = 0 + for score_thr in self.results.keys(): + metric = self.evaluator.combine_results(self.results[score_thr]) + # for key, value in metric.items(): + # metrics['{}_{}'.format(key, score_thr)] = value + metric_str = "precision:{:.5f} recall:{:.5f} hmean:{:.5f}".format( + metric["precision"], metric["recall"], metric["hmean"] + ) + metrics["thr {}".format(score_thr)] = metric_str + hmean = max(hmean, metric["hmean"]) + metrics["hmean"] = hmean + + self.reset() + return metrics + + def 
class DetectionIoUEvaluator(object):
    """Match detected polygons to ground-truth polygons by IoU.

    ``iou_constraint`` is the IoU a detection must exceed to match a ground
    truth. ``area_precision_constraint`` is the fraction of a detection's
    area that must lie inside a "don't care" ground truth for the detection
    itself to be treated as "don't care".
    """

    def __init__(self, iou_constraint=0.5, area_precision_constraint=0.5):
        self.iou_constraint = iou_constraint
        self.area_precision_constraint = area_precision_constraint

    def evaluate_image(self, gt, pred):
        """Evaluate a single image.

        Args:
            gt: sequence of dicts with ``"points"`` and ``"ignore"`` entries.
            pred: sequence of dicts with a ``"points"`` entry.

        Returns:
            dict with ``gtCare`` (ground truths that count), ``detCare``
            (detections that count) and ``detMatched`` (matched pairs).
        """
        # Build every polygon once; entries that shapely reports as invalid
        # are skipped on both sides.
        gt_polys = []
        gt_dont_care = []
        for item in gt:
            points = item["points"]
            dont_care = item["ignore"]
            polygon = Polygon(points)
            if not polygon.is_valid:
                continue
            gt_polys.append(polygon)
            if dont_care:
                gt_dont_care.append(len(gt_polys) - 1)

        det_polys = []
        det_dont_care = set()
        for item in pred:
            polygon = Polygon(item["points"])
            if not polygon.is_valid:
                continue
            det_polys.append(polygon)
            det_area = polygon.area
            # A detection mostly covered by an ignored ground truth is ignored.
            for gt_index in gt_dont_care:
                overlap = gt_polys[gt_index].intersection(polygon).area
                covered = 0 if det_area == 0 else overlap / det_area
                if covered > self.area_precision_constraint:
                    det_dont_care.add(len(det_polys) - 1)
                    break

        # Greedy one-to-one matching in ground-truth order: each counted
        # ground truth takes the first free, counted detection whose IoU
        # exceeds the threshold.
        gt_dont_care_set = set(gt_dont_care)
        matched_dets = set()
        det_matched = 0
        for gt_index, gt_poly in enumerate(gt_polys):
            if gt_index in gt_dont_care_set:
                continue
            for det_index, det_poly in enumerate(det_polys):
                if det_index in matched_dets or det_index in det_dont_care:
                    continue
                union_area = det_poly.union(gt_poly).area
                # Defensive: a zero-area union cannot overlap anything.
                if union_area == 0:
                    continue
                iou = det_poly.intersection(gt_poly).area / union_area
                if iou > self.iou_constraint:
                    matched_dets.add(det_index)
                    det_matched += 1
                    break

        return {
            "gtCare": len(gt_polys) - len(gt_dont_care),
            "detCare": len(det_polys) - len(det_dont_care),
            "detMatched": det_matched,
        }

    def combine_results(self, results):
        """Aggregate per-image results into precision, recall and hmean.

        Args:
            results: iterable of dicts as returned by ``evaluate_image``.

        Returns:
            dict with ``precision``, ``recall`` and ``hmean``; each is 0 when
            its denominator is 0.
        """
        num_gt_care = 0
        num_det_care = 0
        matched_sum = 0
        for result in results:
            num_gt_care += result["gtCare"]
            num_det_care += result["detCare"]
            matched_sum += result["detMatched"]

        recall = 0 if num_gt_care == 0 else float(matched_sum) / num_gt_care
        precision = 0 if num_det_care == 0 else float(matched_sum) / num_det_care
        hmean = (
            0
            if recall + precision == 0
            else 2 * recall * precision / (recall + precision)
        )
        return {
            "precision": precision,
            "recall": recall,
            "hmean": hmean,
        }
+ +from rapidfuzz.distance import Levenshtein +from difflib import SequenceMatcher + +import numpy as np +import string + + +class RecMetric(object): + def __init__( + self, main_indicator="acc", is_filter=False, ignore_space=True, **kwargs + ): + self.main_indicator = main_indicator + self.is_filter = is_filter + self.ignore_space = ignore_space + self.eps = 1e-5 + self.reset() + + def _normalize_text(self, text): + text = "".join( + filter(lambda x: x in (string.digits + string.ascii_letters), text) + ) + return text.lower() + + def __call__(self, pred_label, *args, **kwargs): + preds, labels = pred_label + correct_num = 0 + all_num = 0 + norm_edit_dis = 0.0 + for (pred, pred_conf), (target, _) in zip(preds, labels): + if self.ignore_space: + pred = pred.replace(" ", "") + target = target.replace(" ", "") + if self.is_filter: + pred = self._normalize_text(pred) + target = self._normalize_text(target) + norm_edit_dis += Levenshtein.normalized_distance(pred, target) + if pred == target: + correct_num += 1 + all_num += 1 + self.correct_num += correct_num + self.all_num += all_num + self.norm_edit_dis += norm_edit_dis + return { + "acc": correct_num / (all_num + self.eps), + "norm_edit_dis": 1 - norm_edit_dis / (all_num + self.eps), + } + + def get_metric(self): + """ + return metrics { + 'acc': 0, + 'norm_edit_dis': 0, + } + """ + acc = 1.0 * self.correct_num / (self.all_num + self.eps) + norm_edit_dis = 1 - self.norm_edit_dis / (self.all_num + self.eps) + self.reset() + return {"acc": acc, "norm_edit_dis": norm_edit_dis} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 + self.norm_edit_dis = 0 + + +class CNTMetric(object): + def __init__(self, main_indicator="acc", **kwargs): + self.main_indicator = main_indicator + self.eps = 1e-5 + self.reset() + + def __call__(self, pred_label, *args, **kwargs): + preds, labels = pred_label + correct_num = 0 + all_num = 0 + for pred, target in zip(preds, labels): + if pred == target: + correct_num += 1 + all_num += 
1 + self.correct_num += correct_num + self.all_num += all_num + return { + "acc": correct_num / (all_num + self.eps), + } + + def get_metric(self): + """ + return metrics { + 'acc': 0, + } + """ + acc = 1.0 * self.correct_num / (self.all_num + self.eps) + self.reset() + return {"acc": acc} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 + + +class CANMetric(object): + def __init__(self, main_indicator="exp_rate", **kwargs): + self.main_indicator = main_indicator + self.word_right = [] + self.exp_right = [] + self.word_total_length = 0 + self.exp_total_num = 0 + self.word_rate = 0 + self.exp_rate = 0 + self.reset() + self.epoch_reset() + + def __call__(self, preds, batch, **kwargs): + for k, v in kwargs.items(): + epoch_reset = v + if epoch_reset: + self.epoch_reset() + word_probs = preds + word_label, word_label_mask = batch + line_right = 0 + if word_probs is not None: + word_pred = word_probs.argmax(2) + word_pred = word_pred.cpu().detach().numpy() + word_scores = [ + SequenceMatcher( + None, s1[: int(np.sum(s3))], s2[: int(np.sum(s3))], autojunk=False + ).ratio() + * (len(s1[: int(np.sum(s3))]) + len(s2[: int(np.sum(s3))])) + / len(s1[: int(np.sum(s3))]) + / 2 + for s1, s2, s3 in zip(word_label, word_pred, word_label_mask) + ] + batch_size = len(word_scores) + for i in range(batch_size): + if word_scores[i] == 1: + line_right += 1 + self.word_rate = np.mean(word_scores) # float + self.exp_rate = line_right / batch_size # float + exp_length, word_length = word_label.shape[:2] + self.word_right.append(self.word_rate * word_length) + self.exp_right.append(self.exp_rate * exp_length) + self.word_total_length = self.word_total_length + word_length + self.exp_total_num = self.exp_total_num + exp_length + + def get_metric(self): + """ + return { + 'word_rate': 0, + "exp_rate": 0, + } + """ + cur_word_rate = sum(self.word_right) / self.word_total_length + cur_exp_rate = sum(self.exp_right) / self.exp_total_num + self.reset() + return {"word_rate": 
cur_word_rate, "exp_rate": cur_exp_rate} + + def reset(self): + self.word_rate = 0 + self.exp_rate = 0 + + def epoch_reset(self): + self.word_right = [] + self.exp_right = [] + self.word_total_length = 0 + self.exp_total_num = 0 diff --git a/docling_ibm_models/slanet_1m/metrics/table_metric.py b/docling_ibm_models/slanet_1m/metrics/table_metric.py new file mode 100644 index 0000000..6df2bb1 --- /dev/null +++ b/docling_ibm_models/slanet_1m/metrics/table_metric.py @@ -0,0 +1,161 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np +from metrics.det_metric import DetMetric + + +class TableStructureMetric(object): + def __init__(self, main_indicator="acc", eps=1e-6, del_thead_tbody=False, **kwargs): + self.main_indicator = main_indicator + self.eps = eps + self.del_thead_tbody = del_thead_tbody + self.reset() + + def __call__(self, pred_label, batch=None, *args, **kwargs): + preds, labels = pred_label + pred_structure_batch_list = preds["structure_batch_list"] + gt_structure_batch_list = labels["structure_batch_list"] + correct_num = 0 + all_num = 0 + for (pred, pred_conf), target in zip( + pred_structure_batch_list, gt_structure_batch_list + ): + pred_str = "".join(pred) + target_str = "".join(target) + if self.del_thead_tbody: + pred_str = ( + pred_str.replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + ) + target_str = ( + target_str.replace("", "") + .replace("", "") + .replace("", "") + .replace("", "") + ) + if pred_str == target_str: + correct_num += 1 + all_num += 1 + self.correct_num += correct_num + self.all_num += all_num + + def get_metric(self): + """ + return metrics { + 'acc': 0, + } + """ + acc = 1.0 * self.correct_num / (self.all_num + self.eps) + self.reset() + return {"acc": acc} + + def reset(self): + self.correct_num = 0 + self.all_num = 0 + self.len_acc_num = 0 + self.token_nums = 0 + self.anys_dict = dict() + + +class TableMetric(object): + def __init__( + self, + main_indicator="acc", + compute_bbox_metric=False, + box_format="xyxy", + del_thead_tbody=False, + **kwargs, + ): + """ + + @param sub_metrics: configs of sub_metric + @param main_matric: main_matric for save best_model + @param kwargs: + """ + self.structure_metric = TableStructureMetric(del_thead_tbody=del_thead_tbody) + self.bbox_metric = DetMetric() if compute_bbox_metric else None + self.main_indicator = main_indicator + self.box_format = box_format + self.reset() + + def __call__(self, pred_label, batch=None, *args, **kwargs): + self.structure_metric(pred_label) 
+ if self.bbox_metric is not None: + self.bbox_metric(*self.prepare_bbox_metric_input(pred_label)) + + def prepare_bbox_metric_input(self, pred_label): + pred_bbox_batch_list = [] + gt_ignore_tags_batch_list = [] + gt_bbox_batch_list = [] + preds, labels = pred_label + + batch_num = len(preds["bbox_batch_list"]) + for batch_idx in range(batch_num): + # pred + pred_bbox_list = [ + self.format_box(pred_box) + for pred_box in preds["bbox_batch_list"][batch_idx] + ] + pred_bbox_batch_list.append({"points": pred_bbox_list}) + + # gt + gt_bbox_list = [] + gt_ignore_tags_list = [] + for gt_box in labels["bbox_batch_list"][batch_idx]: + gt_bbox_list.append(self.format_box(gt_box)) + gt_ignore_tags_list.append(0) + gt_bbox_batch_list.append(gt_bbox_list) + gt_ignore_tags_batch_list.append(gt_ignore_tags_list) + + return [ + pred_bbox_batch_list, + [0, 0, gt_bbox_batch_list, gt_ignore_tags_batch_list], + ] + + def get_metric(self): + structure_metric = self.structure_metric.get_metric() + if self.bbox_metric is None: + return structure_metric + bbox_metric = self.bbox_metric.get_metric() + if self.main_indicator == self.bbox_metric.main_indicator: + output = bbox_metric + for sub_key in structure_metric: + output["structure_metric_{}".format(sub_key)] = structure_metric[ + sub_key + ] + else: + output = structure_metric + for sub_key in bbox_metric: + output["bbox_metric_{}".format(sub_key)] = bbox_metric[sub_key] + return output + + def reset(self): + self.structure_metric.reset() + if self.bbox_metric is not None: + self.bbox_metric.reset() + + def format_box(self, box): + if self.box_format == "xyxy": + x1, y1, x2, y2 = box + box = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] + elif self.box_format == "xywh": + x, y, w, h = box + x1, y1, x2, y2 = x - w // 2, y - h // 2, x + w // 2, y + h // 2 + box = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]] + elif self.box_format == "xyxyxyxy": + x1, y1, x2, y2, x3, y3, x4, y4 = box + box = [[x1, y1], [x2, y2], [x3, y3], [x4, y4]] + return 
box diff --git a/docling_ibm_models/slanet_1m/model_final/inference.pdiparams b/docling_ibm_models/slanet_1m/model_final/inference.pdiparams new file mode 100644 index 0000000..88e0aed Binary files /dev/null and b/docling_ibm_models/slanet_1m/model_final/inference.pdiparams differ diff --git a/docling_ibm_models/slanet_1m/model_final/inference.pdiparams.info b/docling_ibm_models/slanet_1m/model_final/inference.pdiparams.info new file mode 100644 index 0000000..dee1aaf Binary files /dev/null and b/docling_ibm_models/slanet_1m/model_final/inference.pdiparams.info differ diff --git a/docling_ibm_models/slanet_1m/model_final/inference.pdmodel b/docling_ibm_models/slanet_1m/model_final/inference.pdmodel new file mode 100644 index 0000000..5f8e979 Binary files /dev/null and b/docling_ibm_models/slanet_1m/model_final/inference.pdmodel differ diff --git a/docling_ibm_models/slanet_1m/model_final/inference.yml b/docling_ibm_models/slanet_1m/model_final/inference.yml new file mode 100644 index 0000000..b83eae1 --- /dev/null +++ b/docling_ibm_models/slanet_1m/model_final/inference.yml @@ -0,0 +1,72 @@ +PreProcess: + transform_ops: + - DecodeImage: + channel_first: false + img_mode: BGR + - TableLabelEncode: + learn_empty_box: false + loc_reg_num: 4 + max_text_length: 500 + merge_no_span_structure: true + replace_empty_cell_token: false + - TableBoxEncode: + in_box_format: xyxy + out_box_format: xyxy + - ResizeTableImage: + max_len: 488 + - NormalizeImage: + mean: + - 0.485 + - 0.456 + - 0.406 + order: hwc + scale: 1./255. 
+ std: + - 0.229 + - 0.224 + - 0.225 + - PaddingTableImage: + size: + - 488 + - 488 + - ToCHWImage: null + - KeepKeys: + keep_keys: + - image + - structure + - bboxes + - bbox_masks + - length + - shape +PostProcess: + name: TableLabelDecode + merge_no_span_structure: true + character_dict: + - + - + - + - + - + - + - + - + - ' + - ' colspan="2"' + - ' colspan="3"' + - ' rowspan="2"' + - ' colspan="4"' + - ' colspan="6"' + - ' rowspan="3"' + - ' colspan="9"' + - ' colspan="10"' + - ' colspan="7"' + - ' rowspan="4"' + - ' rowspan="5"' + - ' rowspan="9"' + - ' colspan="8"' + - ' rowspan="8"' + - ' rowspan="6"' + - ' rowspan="7"' + - ' rowspan="10"' diff --git a/docling_ibm_models/slanet_1m/modeling/architectures/__init__.py b/docling_ibm_models/slanet_1m/modeling/architectures/__init__.py new file mode 100644 index 0000000..50260f9 --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/architectures/__init__.py @@ -0,0 +1,115 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import importlib + +from paddle.jit import to_static +from paddle.static import InputSpec + +from .base_model import BaseModel +from .distillation_model import DistillationModel + +__all__ = ["build_model", "apply_to_static"] + + +def build_model(config): + config = copy.deepcopy(config) + if not "name" in config: + arch = BaseModel(config) + else: + name = config.pop("name") + mod = importlib.import_module(__name__) + arch = getattr(mod, name)(config) + return arch + + +def apply_to_static(model, config, logger): + if config["Global"].get("to_static", False) is not True: + return model + assert ( + "d2s_train_image_shape" in config["Global"] + ), "d2s_train_image_shape must be assigned for static training mode..." + supported_list = ["DB", "SVTR_LCNet", "TableMaster", "LayoutXLM", "SLANet", "SVTR"] + if config["Architecture"]["algorithm"] in ["Distillation"]: + algo = list(config["Architecture"]["Models"].values())[0]["algorithm"] + else: + algo = config["Architecture"]["algorithm"] + assert ( + algo in supported_list + ), f"algorithms that supports static training must in in {supported_list} but got {algo}" + + specs = [ + InputSpec([None] + config["Global"]["d2s_train_image_shape"], dtype="float32") + ] + + if algo == "SVTR_LCNet": + specs.append( + [ + InputSpec([None, config["Global"]["max_text_length"]], dtype="int64"), + InputSpec([None, config["Global"]["max_text_length"]], dtype="int64"), + InputSpec([None], dtype="int64"), + InputSpec([None], dtype="float64"), + ] + ) + elif algo == "TableMaster": + specs.append( + [ + InputSpec([None, config["Global"]["max_text_length"]], dtype="int64"), + InputSpec( + [None, config["Global"]["max_text_length"], 4], dtype="float32" + ), + InputSpec( + [None, config["Global"]["max_text_length"], 1], dtype="float32" + ), + InputSpec([None, 6], dtype="float32"), + ] + ) + elif algo == "LayoutXLM": + specs = [ + [ + InputSpec(shape=[None, 512], dtype="int64"), # input_ids + InputSpec(shape=[None, 512, 4], 
dtype="int64"), # bbox + InputSpec(shape=[None, 512], dtype="int64"), # attention_mask + InputSpec(shape=[None, 512], dtype="int64"), # token_type_ids + InputSpec(shape=[None, 3, 224, 224], dtype="float32"), # image + InputSpec(shape=[None, 512], dtype="int64"), # label + ] + ] + elif algo == "SLANet": + specs.append( + [ + InputSpec( + [None, config["Global"]["max_text_length"] + 2], dtype="int64" + ), + InputSpec( + [None, config["Global"]["max_text_length"] + 2, 4], dtype="float32" + ), + InputSpec( + [None, config["Global"]["max_text_length"] + 2, 1], dtype="float32" + ), + InputSpec([None], dtype="int64"), + InputSpec([None, 6], dtype="float64"), + ] + ) + elif algo == "SVTR": + specs.append( + [ + InputSpec([None, config["Global"]["max_text_length"]], dtype="int64"), + InputSpec([None], dtype="int64"), + ] + ) + model = to_static(model, input_spec=specs) + logger.info("Successfully to apply @to_static with specs: {}".format(specs)) + return model diff --git a/docling_ibm_models/slanet_1m/modeling/architectures/base_model.py b/docling_ibm_models/slanet_1m/modeling/architectures/base_model.py new file mode 100644 index 0000000..c1b6116 --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/architectures/base_model.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +from modeling.backbones import build_backbone +from modeling.necks import build_neck +from modeling.heads import build_head + +__all__ = ["BaseModel"] + + +class BaseModel(nn.Layer): + def __init__(self, config): + """ + the module for OCR. + args: + config (dict): the super parameters for module. + """ + super(BaseModel, self).__init__() + in_channels = config.get("in_channels", 3) + model_type = config["model_type"] + # build transfrom, + # for rec, transfrom can be TPS,None + # for det and cls, transfrom shoule to be None, + # if you make model differently, you can use transfrom in det and cls + # build backbone, backbone is need for del, rec and cls + self.use_transform = False + if "Backbone" not in config or config["Backbone"] is None: + self.use_backbone = False + else: + self.use_backbone = True + config["Backbone"]["in_channels"] = in_channels + self.backbone = build_backbone(config["Backbone"], model_type) + in_channels = self.backbone.out_channels + + # build neck + # for rec, neck can be cnn,rnn or reshape(None) + # for det, neck can be FPN, BIFPN and so on. 
+ # for cls, neck should be none + if "Neck" not in config or config["Neck"] is None: + self.use_neck = False + else: + self.use_neck = True + config["Neck"]["in_channels"] = in_channels + self.neck = build_neck(config["Neck"]) + in_channels = self.neck.out_channels + + # # build head, head is need for det, rec and cls + if "Head" not in config or config["Head"] is None: + self.use_head = False + else: + self.use_head = True + config["Head"]["in_channels"] = in_channels + self.head = build_head(config["Head"]) + + self.return_all_feats = config.get("return_all_feats", False) + + def forward(self, x, data=None): + y = dict() + if self.use_transform: + x = self.transform(x) + if self.use_backbone: + x = self.backbone(x) + if isinstance(x, dict): + y.update(x) + else: + y["backbone_out"] = x + final_name = "backbone_out" + if self.use_neck: + x = self.neck(x) + if isinstance(x, dict): + y.update(x) + else: + y["neck_out"] = x + final_name = "neck_out" + if self.use_head: + x = self.head(x, targets=data) + # for multi head, save ctc neck out for udml + if isinstance(x, dict) and "ctc_neck" in x.keys(): + y["neck_out"] = x["ctc_neck"] + y["head_out"] = x + elif isinstance(x, dict): + y.update(x) + else: + y["head_out"] = x + final_name = "head_out" + if self.return_all_feats: + if self.training: + return y + elif isinstance(x, dict): + return x + else: + return {final_name: x} + else: + return x diff --git a/docling_ibm_models/slanet_1m/modeling/architectures/distillation_model.py b/docling_ibm_models/slanet_1m/modeling/architectures/distillation_model.py new file mode 100644 index 0000000..98912d1 --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/architectures/distillation_model.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn +from modeling.backbones import build_backbone +from modeling.necks import build_neck +from modeling.heads import build_head +from .base_model import BaseModel +from paddleocr.ppocr.utils.save_load import load_pretrained_params + +__all__ = ["DistillationModel"] + + +class DistillationModel(nn.Layer): + def __init__(self, config): + """ + the module for OCR distillation. + args: + config (dict): the super parameters for module. + """ + super().__init__() + self.model_list = [] + self.model_name_list = [] + for key in config["Models"]: + model_config = config["Models"][key] + freeze_params = False + pretrained = None + if "freeze_params" in model_config: + freeze_params = model_config.pop("freeze_params") + if "pretrained" in model_config: + pretrained = model_config.pop("pretrained") + model = BaseModel(model_config) + if pretrained is not None: + load_pretrained_params(model, pretrained) + if freeze_params: + for param in model.parameters(): + param.trainable = False + self.model_list.append(self.add_sublayer(key, model)) + self.model_name_list.append(key) + + def forward(self, x, data=None): + result_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + result_dict[model_name] = self.model_list[idx](x, data) + return result_dict diff --git a/docling_ibm_models/slanet_1m/modeling/backbones/__init__.py b/docling_ibm_models/slanet_1m/modeling/backbones/__init__.py new file mode 100644 index 
0000000..e91813e --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/backbones/__init__.py @@ -0,0 +1,39 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ["build_backbone"] + + +def build_backbone(config, model_type): + if model_type == "table": + from .det_pp_lcnet import PPLCNet + from .rec_lcnetv3 import PPLCNetV3 + from .det_pp_lcnet_v2 import PPLCNetV2_base + + support_dict = [ + "PPLCNet", + "PPLCNetV3", + "PPLCNetV2_base", + ] + else: + raise NotImplementedError + + module_name = config.pop("name") + assert module_name in support_dict, Exception( + "when model typs is {}, backbone only support {}".format( + model_type, support_dict + ) + ) + module_class = eval(module_name)(**config) + return module_class diff --git a/docling_ibm_models/slanet_1m/modeling/backbones/det_mobilenet_v3.py b/docling_ibm_models/slanet_1m/modeling/backbones/det_mobilenet_v3.py new file mode 100644 index 0000000..98db44b --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/backbones/det_mobilenet_v3.py @@ -0,0 +1,285 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn +import paddle.nn.functional as F +from paddle import ParamAttr + +__all__ = ["MobileNetV3"] + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class MobileNetV3(nn.Layer): + def __init__( + self, in_channels=3, model_name="large", scale=0.5, disable_se=False, **kwargs + ): + """ + the MobilenetV3 backbone network for detection module. 
+ Args: + params(dict): the super parameters for build network + """ + super(MobileNetV3, self).__init__() + + self.disable_se = disable_se + + if model_name == "large": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], + [5, 72, 40, True, "relu", 2], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], + [3, 240, 80, False, "hardswish", 2], + [3, 200, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 480, 112, True, "hardswish", 1], + [3, 672, 112, True, "hardswish", 1], + [5, 672, 160, True, "hardswish", 2], + [5, 960, 160, True, "hardswish", 1], + [5, 960, 160, True, "hardswish", 1], + ] + cls_ch_squeeze = 960 + elif model_name == "small": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], + [3, 88, 24, False, "relu", 1], + [5, 96, 40, True, "hardswish", 2], + [5, 240, 40, True, "hardswish", 1], + [5, 240, 40, True, "hardswish", 1], + [5, 120, 48, True, "hardswish", 1], + [5, 144, 48, True, "hardswish", 1], + [5, 288, 96, True, "hardswish", 2], + [5, 576, 96, True, "hardswish", 1], + [5, 576, 96, True, "hardswish", 1], + ] + cls_ch_squeeze = 576 + else: + raise NotImplementedError( + "mode[" + model_name + "_model] is not implemented!" 
+ ) + + supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] + assert ( + scale in supported_scale + ), "supported scale are {} but input scale is {}".format(supported_scale, scale) + inplanes = 16 + # conv1 + self.conv = ConvBNLayer( + in_channels=in_channels, + out_channels=make_divisible(inplanes * scale), + kernel_size=3, + stride=2, + padding=1, + groups=1, + if_act=True, + act="hardswish", + ) + + self.stages = [] + self.out_channels = [] + block_list = [] + i = 0 + inplanes = make_divisible(inplanes * scale) + for k, exp, c, se, nl, s in cfg: + se = se and not self.disable_se + start_idx = 2 if model_name == "large" else 0 + if s == 2 and i > start_idx: + self.out_channels.append(inplanes) + self.stages.append(nn.Sequential(*block_list)) + block_list = [] + block_list.append( + ResidualUnit( + in_channels=inplanes, + mid_channels=make_divisible(scale * exp), + out_channels=make_divisible(scale * c), + kernel_size=k, + stride=s, + use_se=se, + act=nl, + ) + ) + inplanes = make_divisible(scale * c) + i += 1 + block_list.append( + ConvBNLayer( + in_channels=inplanes, + out_channels=make_divisible(scale * cls_ch_squeeze), + kernel_size=1, + stride=1, + padding=0, + groups=1, + if_act=True, + act="hardswish", + ) + ) + self.stages.append(nn.Sequential(*block_list)) + self.out_channels.append(make_divisible(scale * cls_ch_squeeze)) + for i, stage in enumerate(self.stages): + self.add_sublayer(sublayer=stage, name="stage{}".format(i)) + + def forward(self, x): + x = self.conv(x) + out_list = [] + for stage in self.stages: + x = stage(x) + out_list.append(x) + return out_list + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + if_act=True, + act=None, + ): + super(ConvBNLayer, self).__init__() + self.if_act = if_act + self.act = act + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + 
groups=groups, + bias_attr=False, + ) + + self.bn = nn.BatchNorm(num_channels=out_channels, act=None) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + if self.act == "relu": + x = F.relu(x) + elif self.act == "hardswish": + x = F.hardswish(x) + else: + print( + "The activation function({}) is selected incorrectly.".format( + self.act + ) + ) + exit() + return x + + +class ResidualUnit(nn.Layer): + def __init__( + self, + in_channels, + mid_channels, + out_channels, + kernel_size, + stride, + use_se, + act=None, + ): + super(ResidualUnit, self).__init__() + self.if_shortcut = stride == 1 and in_channels == out_channels + self.if_se = use_se + + self.expand_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + if_act=True, + act=act, + ) + self.bottleneck_conv = ConvBNLayer( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=int((kernel_size - 1) // 2), + groups=mid_channels, + if_act=True, + act=act, + ) + if self.if_se: + self.mid_se = SEModule(mid_channels) + self.linear_conv = ConvBNLayer( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + if_act=False, + act=None, + ) + + def forward(self, inputs): + x = self.expand_conv(inputs) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(inputs, x) + return x + + +class SEModule(nn.Layer): + def __init__(self, in_channels, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + self.conv1 = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels // reduction, + kernel_size=1, + stride=1, + padding=0, + ) + self.conv2 = nn.Conv2D( + in_channels=in_channels // reduction, + out_channels=in_channels, + kernel_size=1, + stride=1, + padding=0, + ) + + def forward(self, inputs): + outputs = 
self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5) + return inputs * outputs diff --git a/docling_ibm_models/slanet_1m/modeling/backbones/det_pp_lcnet.py b/docling_ibm_models/slanet_1m/modeling/backbones/det_pp_lcnet.py new file mode 100644 index 0000000..bf557a4 --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/backbones/det_pp_lcnet.py @@ -0,0 +1,274 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function + +import os +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +from paddle.utils.download import get_path_from_url + +MODEL_URLS = { + "PPLCNet_x0.25": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_25_pretrained.pdparams", + "PPLCNet_x0.35": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_35_pretrained.pdparams", + "PPLCNet_x0.5": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_5_pretrained.pdparams", + "PPLCNet_x0.75": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_75_pretrained.pdparams", + "PPLCNet_x1.0": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_0_pretrained.pdparams", + "PPLCNet_x1.5": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_5_pretrained.pdparams", + "PPLCNet_x2.0": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_0_pretrained.pdparams", + "PPLCNet_x2.5": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_5_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "PPLCNet": ["blocks2", "blocks3", "blocks4", "blocks5", "blocks6"] +} + +__all__ = list(MODEL_URLS.keys()) + +# Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se. 
+# k: kernel_size +# in_c: input channel number in depthwise block +# out_c: output channel number in depthwise block +# s: stride in depthwise block +# use_se: whether to use SE block + +NET_CONFIG = { + "blocks2": + # k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": [ + [3, 128, 256, 2, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + ], + "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]], +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(nn.Layer): + def __init__(self, num_channels, filter_size, num_filters, stride, num_groups=1): + super().__init__() + + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False, + ) + + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)), + ) + self.hardswish = nn.Hardswish() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.hardswish(x) + return x + + +class DepthwiseSeparable(nn.Layer): + def __init__(self, num_channels, num_filters, stride, dw_size=3, use_se=False): + super().__init__() + self.use_se = use_se + self.dw_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_channels, + filter_size=dw_size, + stride=stride, + num_groups=num_channels, + ) + if use_se: + self.se = SEModule(num_channels) + self.pw_conv = ConvBNLayer( + num_channels=num_channels, filter_size=1, 
num_filters=num_filters, stride=1 + ) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + ) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + ) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class PPLCNet(nn.Layer): + def __init__(self, in_channels=3, scale=1.0, pretrained=False, use_ssld=False): + super().__init__() + self.out_channels = [ + int(NET_CONFIG["blocks3"][-1][2] * scale), + int(NET_CONFIG["blocks4"][-1][2] * scale), + int(NET_CONFIG["blocks5"][-1][2] * scale), + int(NET_CONFIG["blocks6"][-1][2] * scale), + ] + self.scale = scale + + self.conv1 = ConvBNLayer( + num_channels=in_channels, + filter_size=3, + num_filters=make_divisible(16 * scale), + stride=2, + ) + + self.blocks2 = nn.Sequential( + *[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + ) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) + ] + ) + + self.blocks3 = nn.Sequential( + *[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + ) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) + ] + ) + + self.blocks4 = nn.Sequential( + *[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + 
dw_size=k, + stride=s, + use_se=se, + ) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) + ] + ) + + self.blocks5 = nn.Sequential( + *[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + ) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) + ] + ) + + self.blocks6 = nn.Sequential( + *[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + ) + for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) + ] + ) + + if pretrained: + self._load_pretrained( + MODEL_URLS["PPLCNet_x{}".format(scale)], use_ssld=use_ssld + ) + + def forward(self, x): + outs = [] + x = self.conv1(x) + x = self.blocks2(x) + x = self.blocks3(x) + outs.append(x) + x = self.blocks4(x) + outs.append(x) + x = self.blocks5(x) + outs.append(x) + x = self.blocks6(x) + outs.append(x) + return outs + + def _load_pretrained(self, pretrained_url, use_ssld=False): + if use_ssld: + pretrained_url = pretrained_url.replace("_pretrained", "_ssld_pretrained") + print(pretrained_url) + local_weight_path = get_path_from_url( + pretrained_url, os.path.expanduser("~/.paddleclas/weights") + ) + param_state_dict = paddle.load(local_weight_path) + self.set_dict(param_state_dict) + return diff --git a/docling_ibm_models/slanet_1m/modeling/backbones/det_pp_lcnet_v2.py b/docling_ibm_models/slanet_1m/modeling/backbones/det_pp_lcnet_v2.py new file mode 100644 index 0000000..5b5a568 --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/backbones/det_pp_lcnet_v2.py @@ -0,0 +1,358 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import os + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal +from paddle.utils.download import get_path_from_url + +MODEL_URLS = { + "PPLCNetV2_small": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_small_ssld_pretrained.pdparams", + "PPLCNetV2_base": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_ssld_pretrained.pdparams", + "PPLCNetV2_large": "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_large_ssld_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +NET_CONFIG = { + # in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut + "stage1": [64, 3, False, False, False, False], + "stage2": [128, 3, False, False, False, False], + "stage3": [256, 5, True, True, True, False], + "stage4": [512, 5, False, True, False, True], +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(nn.Layer): + def __init__( + self, in_channels, out_channels, kernel_size, stride, groups=1, use_act=True + ): + super().__init__() + self.use_act = use_act + self.conv 
= Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False, + ) + + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0)), + ) + if self.use_act: + self.act = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + return x + + +class SEModule(nn.Layer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + ) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + ) + self.hardsigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class RepDepthwiseSeparable(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + stride, + dw_size=3, + split_pw=False, + use_rep=False, + use_se=False, + use_shortcut=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.is_repped = False + + self.dw_size = dw_size + self.split_pw = split_pw + self.use_rep = use_rep + self.use_se = use_se + self.use_shortcut = ( + True + if use_shortcut and stride == 1 and in_channels == out_channels + else False + ) + + if self.use_rep: + self.dw_conv_list = nn.LayerList() + for kernel_size in range(self.dw_size, 0, -2): + if kernel_size == 1 and stride != 1: + continue + dw_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + 
groups=in_channels, + use_act=False, + ) + self.dw_conv_list.append(dw_conv) + self.dw_conv = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + padding=(dw_size - 1) // 2, + groups=in_channels, + ) + else: + self.dw_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels, + ) + + self.act = nn.ReLU() + + if use_se: + self.se = SEModule(in_channels) + + if self.split_pw: + pw_ratio = 0.5 + self.pw_conv_1 = ConvBNLayer( + in_channels=in_channels, + kernel_size=1, + out_channels=int(out_channels * pw_ratio), + stride=1, + ) + self.pw_conv_2 = ConvBNLayer( + in_channels=int(out_channels * pw_ratio), + kernel_size=1, + out_channels=out_channels, + stride=1, + ) + else: + self.pw_conv = ConvBNLayer( + in_channels=in_channels, + kernel_size=1, + out_channels=out_channels, + stride=1, + ) + + def forward(self, x): + if self.use_rep: + input_x = x + if self.is_repped: + x = self.act(self.dw_conv(x)) + else: + y = self.dw_conv_list[0](x) + for dw_conv in self.dw_conv_list[1:]: + y += dw_conv(x) + x = self.act(y) + else: + x = self.dw_conv(x) + + if self.use_se: + x = self.se(x) + if self.split_pw: + x = self.pw_conv_1(x) + x = self.pw_conv_2(x) + else: + x = self.pw_conv(x) + if self.use_shortcut: + x = x + input_x + return x + + def re_parameterize(self): + if self.use_rep: + self.is_repped = True + kernel, bias = self._get_equivalent_kernel_bias() + self.dw_conv.weight.set_value(kernel) + self.dw_conv.bias.set_value(bias) + + def _get_equivalent_kernel_bias(self): + kernel_sum = 0 + bias_sum = 0 + for dw_conv in self.dw_conv_list: + kernel, bias = self._fuse_bn_tensor(dw_conv) + kernel = self._pad_tensor(kernel, to_size=self.dw_size) + kernel_sum += kernel + bias_sum += bias + return kernel_sum, bias_sum + + def _fuse_bn_tensor(self, branch): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = 
branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + def _pad_tensor(self, tensor, to_size): + from_size = tensor.shape[-1] + if from_size == to_size: + return tensor + pad = (to_size - from_size) // 2 + return F.pad(tensor, [pad, pad, pad, pad]) + + +class PPLCNetV2(nn.Layer): + def __init__(self, scale, depths, out_indx=[1, 2, 3, 4], **kwargs): + super().__init__(**kwargs) + self.scale = scale + self.out_channels = [ + # int(NET_CONFIG["blocks3"][-1][2] * scale), + int(NET_CONFIG["stage1"][0] * scale * 2), + int(NET_CONFIG["stage2"][0] * scale * 2), + int(NET_CONFIG["stage3"][0] * scale * 2), + int(NET_CONFIG["stage4"][0] * scale * 2), + ] + self.stem = nn.Sequential( + *[ + ConvBNLayer( + in_channels=3, + kernel_size=3, + out_channels=make_divisible(32 * scale), + stride=2, + ), + RepDepthwiseSeparable( + in_channels=make_divisible(32 * scale), + out_channels=make_divisible(64 * scale), + stride=1, + dw_size=3, + ), + ] + ) + self.out_indx = out_indx + # stages + self.stages = nn.LayerList() + for depth_idx, k in enumerate(NET_CONFIG): + ( + in_channels, + kernel_size, + split_pw, + use_rep, + use_se, + use_shortcut, + ) = NET_CONFIG[k] + self.stages.append( + nn.Sequential( + *[ + RepDepthwiseSeparable( + in_channels=make_divisible( + (in_channels if i == 0 else in_channels * 2) * scale + ), + out_channels=make_divisible(in_channels * 2 * scale), + stride=2 if i == 0 else 1, + dw_size=kernel_size, + split_pw=split_pw, + use_rep=use_rep, + use_se=use_se, + use_shortcut=use_shortcut, + ) + for i in range(depths[depth_idx]) + ] + ) + ) + + # if pretrained: + self._load_pretrained(MODEL_URLS["PPLCNetV2_base"], use_ssld=True) + + def forward(self, x): + x = self.stem(x) + i = 1 + outs = [] + for stage in self.stages: + x = stage(x) + if i in self.out_indx: + outs.append(x) + i += 1 + 
return outs + + def _load_pretrained(self, pretrained_url, use_ssld=False): + print(pretrained_url) + local_weight_path = get_path_from_url( + pretrained_url, os.path.expanduser("~/.paddleclas/weights") + ) + param_state_dict = paddle.load(local_weight_path) + self.set_dict(param_state_dict) + print("load pretrain ssd success!") + return + + +def PPLCNetV2_base(in_channels=3, **kwargs): + """ + PPLCNetV2_base + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNetV2_base` model depends on args. + """ + model = PPLCNetV2(scale=1.0, depths=[2, 2, 6, 2], **kwargs) + return model diff --git a/docling_ibm_models/slanet_1m/modeling/backbones/rec_lcnetv3.py b/docling_ibm_models/slanet_1m/modeling/backbones/rec_lcnetv3.py new file mode 100644 index 0000000..b54670c --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/backbones/rec_lcnetv3.py @@ -0,0 +1,554 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant, KaimingNormal +from paddle.nn import ( + AdaptiveAvgPool2D, + BatchNorm2D, + Conv2D, + Dropout, + Hardsigmoid, + Hardswish, + Identity, + Linear, + ReLU, +) +from paddle.regularizer import L2Decay + +NET_CONFIG_det = { + "blocks2": + # k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": [ + [3, 128, 256, 2, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + ], + "blocks6": [ + [5, 256, 512, 2, True], + [5, 512, 512, 1, True], + [5, 512, 512, 1, False], + [5, 512, 512, 1, False], + ], +} + +NET_CONFIG_rec = { + "blocks2": + # k, in_c, out_c, s, use_se + [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 1, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, (2, 1), False], [3, 128, 128, 1, False]], + "blocks5": [ + [3, 128, 256, (1, 2), False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + [5, 256, 256, 1, False], + ], + "blocks6": [ + [5, 256, 512, (2, 1), True], + [5, 512, 512, 1, True], + [5, 512, 512, (2, 1), False], + [5, 512, 512, 1, False], + ], +} + + +def make_divisible(v, divisor=16, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class LearnableAffineBlock(nn.Layer): + def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.1): + super().__init__() + self.scale = self.create_parameter( + shape=[ + 1, + ], + default_initializer=Constant(value=scale_value), + 
attr=ParamAttr(learning_rate=lr_mult * lab_lr), + ) + self.add_parameter("scale", self.scale) + self.bias = self.create_parameter( + shape=[ + 1, + ], + default_initializer=Constant(value=bias_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr), + ) + self.add_parameter("bias", self.bias) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNLayer(nn.Layer): + def __init__( + self, in_channels, out_channels, kernel_size, stride, groups=1, lr_mult=1.0 + ): + super().__init__() + self.conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal(), learning_rate=lr_mult), + bias_attr=False, + ) + + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr(regularizer=L2Decay(0.0), learning_rate=lr_mult), + ) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class Act(nn.Layer): + def __init__(self, act="hswish", lr_mult=1.0, lab_lr=0.1): + super().__init__() + if act == "hswish": + self.act = Hardswish() + else: + assert act == "relu" + self.act = ReLU() + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x): + return self.lab(self.act(x)) + + +class LearnableRepLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + num_conv_branches=1, + lr_mult=1.0, + lab_lr=0.1, + ): + super().__init__() + self.is_repped = False + self.groups = groups + self.stride = stride + self.kernel_size = kernel_size + self.in_channels = in_channels + self.out_channels = out_channels + self.num_conv_branches = num_conv_branches + self.padding = (kernel_size - 1) // 2 + + self.identity = ( + BatchNorm2D( + num_features=in_channels, + weight_attr=ParamAttr(learning_rate=lr_mult), + 
bias_attr=ParamAttr(learning_rate=lr_mult), + ) + if out_channels == in_channels and stride == 1 + else None + ) + + self.conv_kxk = nn.LayerList( + [ + ConvBNLayer( + in_channels, + out_channels, + kernel_size, + stride, + groups=groups, + lr_mult=lr_mult, + ) + for _ in range(self.num_conv_branches) + ] + ) + + self.conv_1x1 = ( + ConvBNLayer( + in_channels, out_channels, 1, stride, groups=groups, lr_mult=lr_mult + ) + if kernel_size > 1 + else None + ) + + self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr) + self.act = Act(lr_mult=lr_mult, lab_lr=lab_lr) + + def forward(self, x): + # for export + if self.is_repped: + out = self.lab(self.reparam_conv(x)) + if self.stride != 2: + out = self.act(out) + return out + + out = 0 + if self.identity is not None: + out += self.identity(x) + + if self.conv_1x1 is not None: + out += self.conv_1x1(x) + + for conv in self.conv_kxk: + out += conv(x) + + out = self.lab(out) + if self.stride != 2: + out = self.act(out) + return out + + def rep(self): + if self.is_repped: + return + kernel, bias = self._get_kernel_bias() + self.reparam_conv = Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + groups=self.groups, + ) + self.reparam_conv.weight.set_value(kernel) + self.reparam_conv.bias.set_value(bias) + self.is_repped = True + + def _pad_kernel_1x1_to_kxk(self, kernel1x1, pad): + if not isinstance(kernel1x1, paddle.Tensor): + return 0 + else: + return nn.functional.pad(kernel1x1, [pad, pad, pad, pad]) + + def _get_kernel_bias(self): + kernel_conv_1x1, bias_conv_1x1 = self._fuse_bn_tensor(self.conv_1x1) + kernel_conv_1x1 = self._pad_kernel_1x1_to_kxk( + kernel_conv_1x1, self.kernel_size // 2 + ) + + kernel_identity, bias_identity = self._fuse_bn_tensor(self.identity) + + kernel_conv_kxk = 0 + bias_conv_kxk = 0 + for conv in self.conv_kxk: + kernel, bias = self._fuse_bn_tensor(conv) + kernel_conv_kxk += kernel + 
bias_conv_kxk += bias + + kernel_reparam = kernel_conv_kxk + kernel_conv_1x1 + kernel_identity + bias_reparam = bias_conv_kxk + bias_conv_1x1 + bias_identity + return kernel_reparam, bias_reparam + + def _fuse_bn_tensor(self, branch): + if not branch: + return 0, 0 + elif isinstance(branch, ConvBNLayer): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + else: + assert isinstance(branch, BatchNorm2D) + if not hasattr(self, "id_tensor"): + input_dim = self.in_channels // self.groups + kernel_value = paddle.zeros( + (self.in_channels, input_dim, self.kernel_size, self.kernel_size), + dtype=branch.weight.dtype, + ) + for i in range(self.in_channels): + kernel_value[ + i, i % input_dim, self.kernel_size // 2, self.kernel_size // 2 + ] = 1 + self.id_tensor = kernel_value + kernel = self.id_tensor + running_mean = branch._mean + running_var = branch._variance + gamma = branch.weight + beta = branch.bias + eps = branch._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class SELayer(nn.Layer): + def __init__(self, channel, reduction=4, lr_mult=1.0): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + ) + self.relu = ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + ) + self.hardsigmoid = Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = 
paddle.multiply(x=identity, y=x) + return x + + +class LCNetV3Block(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + stride, + dw_size, + use_se=False, + conv_kxk_num=4, + lr_mult=1.0, + lab_lr=0.1, + ): + super().__init__() + self.use_se = use_se + self.dw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr, + ) + if use_se: + self.se = SELayer(in_channels, lr_mult=lr_mult) + self.pw_conv = LearnableRepLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + num_conv_branches=conv_kxk_num, + lr_mult=lr_mult, + lab_lr=lab_lr, + ) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class PPLCNetV3(nn.Layer): + def __init__( + self, + scale=1.0, + conv_kxk_num=4, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + lab_lr=0.1, + det=False, + **kwargs, + ): + super().__init__() + self.scale = scale + self.lr_mult_list = lr_mult_list + self.det = det + + self.net_config = NET_CONFIG_det if self.det else NET_CONFIG_rec + + assert isinstance( + self.lr_mult_list, (list, tuple) + ), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list) + ) + assert ( + len(self.lr_mult_list) == 6 + ), "lr_mult_list length should be 6 but got {}".format(len(self.lr_mult_list)) + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=make_divisible(16 * scale), + kernel_size=3, + stride=2, + lr_mult=self.lr_mult_list[0], + ) + + self.blocks2 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[1], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks2"]) + ] + ) + + 
self.blocks3 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[2], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks3"]) + ] + ) + + self.blocks4 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[3], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks4"]) + ] + ) + + self.blocks5 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[4], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks5"]) + ] + ) + + self.blocks6 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[5], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks6"]) + ] + ) + self.out_channels = make_divisible(512 * scale) + + if self.det: + mv_c = [16, 24, 56, 480] + self.out_channels = [ + make_divisible(self.net_config["blocks3"][-1][2] * scale), + make_divisible(self.net_config["blocks4"][-1][2] * scale), + make_divisible(self.net_config["blocks5"][-1][2] * scale), + make_divisible(self.net_config["blocks6"][-1][2] * scale), + ] + + self.layer_list = nn.LayerList( + [ + nn.Conv2D(self.out_channels[0], int(mv_c[0] * scale), 1, 1, 0), + nn.Conv2D(self.out_channels[1], int(mv_c[1] * scale), 1, 1, 0), + nn.Conv2D(self.out_channels[2], int(mv_c[2] * scale), 1, 1, 0), + 
nn.Conv2D(self.out_channels[3], int(mv_c[3] * scale), 1, 1, 0), + ] + ) + self.out_channels = [ + int(mv_c[0] * scale), + int(mv_c[1] * scale), + int(mv_c[2] * scale), + int(mv_c[3] * scale), + ] + + def forward(self, x): + out_list = [] + x = self.conv1(x) + + x = self.blocks2(x) + x = self.blocks3(x) + out_list.append(x) + x = self.blocks4(x) + out_list.append(x) + x = self.blocks5(x) + out_list.append(x) + x = self.blocks6(x) + out_list.append(x) + + if self.det: + out_list[0] = self.layer_list[0](out_list[0]) + out_list[1] = self.layer_list[1](out_list[1]) + out_list[2] = self.layer_list[2](out_list[2]) + out_list[3] = self.layer_list[3](out_list[3]) + return out_list + + if self.training: + x = F.adaptive_avg_pool2d(x, [1, 40]) + else: + x = F.avg_pool2d(x, [3, 2]) + return x diff --git a/docling_ibm_models/slanet_1m/modeling/backbones/rec_resnet_fpn.py b/docling_ibm_models/slanet_1m/modeling/backbones/rec_resnet_fpn.py new file mode 100644 index 0000000..d259f1d --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/backbones/rec_resnet_fpn.py @@ -0,0 +1,317 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import nn, ParamAttr +from paddle.nn import functional as F +import paddle +import numpy as np + +__all__ = ["ResNetFPN"] + + +class ResNetFPN(nn.Layer): + def __init__(self, in_channels=1, layers=50, **kwargs): + super(ResNetFPN, self).__init__() + supported_layers = { + 18: {"depth": [2, 2, 2, 2], "block_class": BasicBlock}, + 34: {"depth": [3, 4, 6, 3], "block_class": BasicBlock}, + 50: {"depth": [3, 4, 6, 3], "block_class": BottleneckBlock}, + 101: {"depth": [3, 4, 23, 3], "block_class": BottleneckBlock}, + 152: {"depth": [3, 8, 36, 3], "block_class": BottleneckBlock}, + } + stride_list = [(2, 2), (2, 2), (1, 1), (1, 1)] + num_filters = [64, 128, 256, 512] + self.depth = supported_layers[layers]["depth"] + self.F = [] + self.conv = ConvBNLayer( + in_channels=in_channels, + out_channels=64, + kernel_size=7, + stride=2, + act="relu", + name="conv1", + ) + self.block_list = [] + in_ch = 64 + if layers >= 50: + for block in range(len(self.depth)): + for i in range(self.depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + block_list = self.add_sublayer( + "bottleneckBlock_{}_{}".format(block, i), + BottleneckBlock( + in_channels=in_ch, + out_channels=num_filters[block], + stride=stride_list[block] if i == 0 else 1, + name=conv_name, + ), + ) + in_ch = num_filters[block] * 4 + self.block_list.append(block_list) + self.F.append(block_list) + else: + for block in range(len(self.depth)): + for i in range(self.depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + if i == 0 and block != 0: + stride = (2, 1) + else: + stride = (1, 1) + basic_block = self.add_sublayer( + conv_name, + BasicBlock( + in_channels=in_ch, + out_channels=num_filters[block], + 
stride=stride_list[block] if i == 0 else 1, + is_first=block == i == 0, + name=conv_name, + ), + ) + in_ch = basic_block.out_channels + self.block_list.append(basic_block) + out_ch_list = [in_ch // 4, in_ch // 2, in_ch] + self.base_block = [] + self.conv_trans = [] + self.bn_block = [] + for i in [-2, -3]: + in_channels = out_ch_list[i + 1] + out_ch_list[i] + + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_0".format(i), + nn.Conv2D( + in_channels=in_channels, + out_channels=out_ch_list[i], + kernel_size=1, + weight_attr=ParamAttr(trainable=True), + bias_attr=ParamAttr(trainable=True), + ), + ) + ) + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_1".format(i), + nn.Conv2D( + in_channels=out_ch_list[i], + out_channels=out_ch_list[i], + kernel_size=3, + padding=1, + weight_attr=ParamAttr(trainable=True), + bias_attr=ParamAttr(trainable=True), + ), + ) + ) + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_2".format(i), + nn.BatchNorm( + num_channels=out_ch_list[i], + act="relu", + param_attr=ParamAttr(trainable=True), + bias_attr=ParamAttr(trainable=True), + ), + ) + ) + self.base_block.append( + self.add_sublayer( + "F_{}_base_block_3".format(i), + nn.Conv2D( + in_channels=out_ch_list[i], + out_channels=512, + kernel_size=1, + bias_attr=ParamAttr(trainable=True), + weight_attr=ParamAttr(trainable=True), + ), + ) + ) + self.out_channels = 512 + + def __call__(self, x): + x = self.conv(x) + fpn_list = [] + F = [] + for i in range(len(self.depth)): + fpn_list.append(np.sum(self.depth[: i + 1])) + + for i, block in enumerate(self.block_list): + x = block(x) + for number in fpn_list: + if i + 1 == number: + F.append(x) + base = F[-1] + + j = 0 + for i, block in enumerate(self.base_block): + if i % 3 == 0 and i < 6: + j = j + 1 + b, c, w, h = F[-j - 1].shape + if [w, h] == list(base.shape[2:]): + base = base + else: + base = self.conv_trans[j - 1](base) + base = self.bn_block[j - 1](base) + base = paddle.concat([base, F[-j 
- 1]], axis=1) + base = block(base) + return base + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act=None, + name=None, + ): + super(ConvBNLayer, self).__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=2 if stride == (1, 1) else kernel_size, + dilation=2 if stride == (1, 1) else 1, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + ".conv2d.output.1.w_0"), + bias_attr=False, + ) + + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self.bn = nn.BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr(name=name + ".output.1.w_0"), + bias_attr=ParamAttr(name=name + ".output.1.b_0"), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance", + ) + + def __call__(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class ShortCut(nn.Layer): + def __init__(self, in_channels, out_channels, stride, name, is_first=False): + super(ShortCut, self).__init__() + self.use_conv = True + + if in_channels != out_channels or stride != 1 or is_first == True: + if stride == (1, 1): + self.conv = ConvBNLayer(in_channels, out_channels, 1, 1, name=name) + else: # stride==(2,2) + self.conv = ConvBNLayer(in_channels, out_channels, 1, stride, name=name) + else: + self.use_conv = False + + def forward(self, x): + if self.use_conv: + x = self.conv(x) + return x + + +class BottleneckBlock(nn.Layer): + def __init__(self, in_channels, out_channels, stride, name): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act="relu", + name=name + "_branch2a", + ) + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act="relu", + name=name + "_branch2b", + ) + + self.conv2 
= ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c", + ) + + self.short = ShortCut( + in_channels=in_channels, + out_channels=out_channels * 4, + stride=stride, + is_first=False, + name=name + "_branch1", + ) + self.out_channels = out_channels * 4 + + def forward(self, x): + y = self.conv0(x) + y = self.conv1(y) + y = self.conv2(y) + y = y + self.short(x) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, in_channels, out_channels, stride, name, is_first): + super(BasicBlock, self).__init__() + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + act="relu", + stride=stride, + name=name + "_branch2a", + ) + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + act=None, + name=name + "_branch2b", + ) + self.short = ShortCut( + in_channels=in_channels, + out_channels=out_channels, + stride=stride, + is_first=is_first, + name=name + "_branch1", + ) + self.out_channels = out_channels + + def forward(self, x): + y = self.conv0(x) + y = self.conv1(y) + y = y + self.short(x) + return F.relu(y) diff --git a/docling_ibm_models/slanet_1m/modeling/backbones/rec_svtrnet.py b/docling_ibm_models/slanet_1m/modeling/backbones/rec_svtrnet.py new file mode 100644 index 0000000..427c87b --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/backbones/rec_svtrnet.py @@ -0,0 +1,642 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +import numpy as np +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal + +trunc_normal_ = TruncatedNormal(std=0.02) +normal_ = Normal +zeros_ = Constant(value=0.0) +ones_ = Constant(value=1.0) + + +def drop_path(x, drop_prob=0.0, training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... 
+ """ + if drop_prob == 0.0 or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1, + act=nn.GELU, + ): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()), + bias_attr=bias_attr, + ) + self.norm = nn.BatchNorm2D(out_channels) + self.act = act() + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + 
+class ConvMixer(nn.Layer): + def __init__( + self, + dim, + num_heads=8, + HW=[8, 25], + local_k=[3, 3], + ): + super().__init__() + self.HW = HW + self.dim = dim + self.local_mixer = nn.Conv2D( + dim, + dim, + local_k, + 1, + [local_k[0] // 2, local_k[1] // 2], + groups=num_heads, + weight_attr=ParamAttr(initializer=KaimingNormal()), + ) + + def forward(self, x): + h = self.HW[0] + w = self.HW[1] + x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w]) + x = self.local_mixer(x) + x = x.flatten(2).transpose([0, 2, 1]) + return x + + +class Attention(nn.Layer): + def __init__( + self, + dim, + num_heads=8, + mixer="Global", + HW=None, + local_k=[7, 11], + qkv_bias=False, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + super().__init__() + self.num_heads = num_heads + self.dim = dim + self.head_dim = dim // num_heads + self.scale = qk_scale or self.head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.HW = HW + if HW is not None: + H = HW[0] + W = HW[1] + self.N = H * W + self.C = dim + if mixer == "Local" and HW is not None: + hk = local_k[0] + wk = local_k[1] + mask = paddle.ones([H * W, H + hk - 1, W + wk - 1], dtype="float32") + for h in range(0, H): + for w in range(0, W): + mask[h * W + w, h : h + hk, w : w + wk] = 0.0 + mask_paddle = mask[:, hk // 2 : H + hk // 2, wk // 2 : W + wk // 2].flatten( + 1 + ) + mask_inf = paddle.full([H * W, H * W], "-inf", dtype="float32") + mask = paddle.where(mask_paddle < 1, mask_paddle, mask_inf) + self.mask = mask.unsqueeze([0, 1]) + self.mixer = mixer + + def forward(self, x): + qkv = ( + self.qkv(x) + .reshape((0, -1, 3, self.num_heads, self.head_dim)) + .transpose((2, 0, 3, 1, 4)) + ) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = q.matmul(k.transpose((0, 1, 3, 2))) + if self.mixer == "Local": + attn += self.mask + attn = nn.functional.softmax(attn, axis=-1) + 
attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, -1, self.dim)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__( + self, + dim, + num_heads, + mixer="Global", + local_mixer=[7, 11], + HW=None, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer="nn.LayerNorm", + epsilon=1e-6, + prenorm=True, + ): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm1 = norm_layer(dim) + if mixer == "Global" or mixer == "Local": + self.mixer = Attention( + dim, + num_heads=num_heads, + mixer=mixer, + HW=HW, + local_k=local_mixer, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + elif mixer == "Conv": + self.mixer = ConvMixer(dim, num_heads=num_heads, HW=HW, local_k=local_mixer) + else: + raise TypeError("The mixer must be one of [Global, Local, Conv]") + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp_ratio = mlp_ratio + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + self.prenorm = prenorm + + def forward(self, x): + if self.prenorm: + x = self.norm1(x + self.drop_path(self.mixer(x))) + x = self.norm2(x + self.drop_path(self.mlp(x))) + else: + x = x + self.drop_path(self.mixer(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """Image to Patch Embedding""" + + def __init__( + self, + img_size=[32, 100], + in_channels=3, + embed_dim=768, + sub_num=2, + patch_size=[4, 4], + mode="pope", + ): + super().__init__() + num_patches = (img_size[1] // (2**sub_num)) * (img_size[0] // 
(2**sub_num)) + self.img_size = img_size + self.num_patches = num_patches + self.embed_dim = embed_dim + self.norm = None + if mode == "pope": + if sub_num == 2: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None, + ), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None, + ), + ) + if sub_num == 3: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 4, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None, + ), + ConvBNLayer( + in_channels=embed_dim // 4, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None, + ), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None, + ), + ) + elif mode == "linear": + self.proj = nn.Conv2D( + 1, embed_dim, kernel_size=patch_size, stride=patch_size + ) + self.num_patches = ( + img_size[0] // patch_size[0] * img_size[1] // patch_size[1] + ) + + def forward(self, x): + B, C, H, W = x.shape + assert ( + H == self.img_size[0] and W == self.img_size[1] + ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
+ x = self.proj(x).flatten(2).transpose((0, 2, 1)) + return x + + +class SubSample(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + types="Pool", + stride=[2, 1], + sub_norm="nn.LayerNorm", + act=None, + ): + super().__init__() + self.types = types + if types == "Pool": + self.avgpool = nn.AvgPool2D( + kernel_size=[3, 5], stride=stride, padding=[1, 2] + ) + self.maxpool = nn.MaxPool2D( + kernel_size=[3, 5], stride=stride, padding=[1, 2] + ) + self.proj = nn.Linear(in_channels, out_channels) + else: + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + weight_attr=ParamAttr(initializer=KaimingNormal()), + ) + self.norm = eval(sub_norm)(out_channels) + if act is not None: + self.act = act() + else: + self.act = None + + def forward(self, x): + if self.types == "Pool": + x1 = self.avgpool(x) + x2 = self.maxpool(x) + x = (x1 + x2) * 0.5 + out = self.proj(x.flatten(2).transpose((0, 2, 1))) + else: + x = self.conv(x) + out = x.flatten(2).transpose((0, 2, 1)) + out = self.norm(out) + if self.act is not None: + out = self.act(out) + + return out + + +class SVTRNet(nn.Layer): + def __init__( + self, + img_size=[32, 100], + in_channels=3, + embed_dim=[64, 128, 256], + depth=[3, 6, 3], + num_heads=[2, 4, 8], + mixer=["Local"] * 6 + ["Global"] * 6, # Local atten, Global atten, Conv + local_mixer=[[7, 11], [7, 11], [7, 11]], + patch_merging="Conv", # Conv, Pool, None + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + last_drop=0.1, + attn_drop_rate=0.0, + drop_path_rate=0.1, + norm_layer="nn.LayerNorm", + sub_norm="nn.LayerNorm", + epsilon=1e-6, + out_channels=192, + out_char_num=25, + block_unit="Block", + act="nn.GELU", + last_stage=True, + sub_num=2, + prenorm=True, + use_lenhead=False, + **kwargs, + ): + super().__init__() + self.img_size = img_size + self.embed_dim = embed_dim + self.out_channels = out_channels + self.prenorm = prenorm + patch_merging = ( + None + if patch_merging != 
"Conv" and patch_merging != "Pool" + else patch_merging + ) + self.patch_embed = PatchEmbed( + img_size=img_size, + in_channels=in_channels, + embed_dim=embed_dim[0], + sub_num=sub_num, + ) + num_patches = self.patch_embed.num_patches + self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)] + self.pos_embed = self.create_parameter( + shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_ + ) + self.add_parameter("pos_embed", self.pos_embed) + self.pos_drop = nn.Dropout(p=drop_rate) + Block_unit = eval(block_unit) + + dpr = np.linspace(0, drop_path_rate, sum(depth)) + self.blocks1 = nn.LayerList( + [ + Block_unit( + dim=embed_dim[0], + num_heads=num_heads[0], + mixer=mixer[0 : depth[0]][i], + HW=self.HW, + local_mixer=local_mixer[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[0 : depth[0]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm, + ) + for i in range(depth[0]) + ] + ) + if patch_merging is not None: + self.sub_sample1 = SubSample( + embed_dim[0], + embed_dim[1], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging, + ) + HW = [self.HW[0] // 2, self.HW[1]] + else: + HW = self.HW + self.patch_merging = patch_merging + self.blocks2 = nn.LayerList( + [ + Block_unit( + dim=embed_dim[1], + num_heads=num_heads[1], + mixer=mixer[depth[0] : depth[0] + depth[1]][i], + HW=HW, + local_mixer=local_mixer[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0] : depth[0] + depth[1]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm, + ) + for i in range(depth[1]) + ] + ) + if patch_merging is not None: + self.sub_sample2 = SubSample( + embed_dim[1], + embed_dim[2], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging, + ) + HW = [self.HW[0] // 4, self.HW[1]] + else: + HW = self.HW + 
self.blocks3 = nn.LayerList( + [ + Block_unit( + dim=embed_dim[2], + num_heads=num_heads[2], + mixer=mixer[depth[0] + depth[1] :][i], + HW=HW, + local_mixer=local_mixer[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0] + depth[1] :][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm, + ) + for i in range(depth[2]) + ] + ) + self.last_stage = last_stage + if last_stage: + self.avg_pool = nn.AdaptiveAvgPool2D([1, out_char_num]) + self.last_conv = nn.Conv2D( + in_channels=embed_dim[2], + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False, + ) + self.hardswish = nn.Hardswish() + self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer") + if not prenorm: + self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon) + self.use_lenhead = use_lenhead + if use_lenhead: + self.len_conv = nn.Linear(embed_dim[2], self.out_channels) + self.hardswish_len = nn.Hardswish() + self.dropout_len = nn.Dropout(p=last_drop, mode="downscale_in_infer") + + trunc_normal_(self.pos_embed) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x = self.patch_embed(x) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample1( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[0], self.HW[0], self.HW[1]] + ) + ) + for blk in self.blocks2: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample2( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[1], self.HW[0] // 2, self.HW[1]] + ) + ) + for blk in self.blocks3: + x = blk(x) + if not self.prenorm: + x = self.norm(x) + 
return x + + def forward(self, x): + x = self.forward_features(x) + if self.use_lenhead: + len_x = self.len_conv(x.mean(1)) + len_x = self.dropout_len(self.hardswish_len(len_x)) + if self.last_stage: + if self.patch_merging is not None: + h = self.HW[0] // 4 + else: + h = self.HW[0] + x = self.avg_pool( + x.transpose([0, 2, 1]).reshape([0, self.embed_dim[2], h, self.HW[1]]) + ) + x = self.last_conv(x) + x = self.hardswish(x) + x = self.dropout(x) + if self.use_lenhead: + return x, len_x + return x diff --git a/docling_ibm_models/slanet_1m/modeling/heads/__init__.py b/docling_ibm_models/slanet_1m/modeling/heads/__init__.py new file mode 100644 index 0000000..829728f --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/heads/__init__.py @@ -0,0 +1,40 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__all__ = ["build_head"] + + +def build_head(config): + + # rec head + from .rec_ctc_head import CTCHead + from .rec_att_head import AttentionHead + from .rec_nrtr_head import Transformer + from .rec_multi_head import MultiHead + + + from .table_att_head import TableAttentionHead, SLAHead + + support_dict = [ + "SLAHead", + ] + + # table head + + module_name = config.pop("name") + assert module_name in support_dict, Exception( + "head only support {}".format(support_dict) + ) + module_class = eval(module_name)(**config) + return module_class diff --git a/docling_ibm_models/slanet_1m/modeling/heads/rec_att_head.py b/docling_ibm_models/slanet_1m/modeling/heads/rec_att_head.py new file mode 100644 index 0000000..2c952ce --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/heads/rec_att_head.py @@ -0,0 +1,215 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np + + +class AttentionHead(nn.Layer): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionHead, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionGRUCell( + in_channels, hidden_size, out_channels, use_gru=False + ) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = inputs.shape[0] + num_steps = batch_max_length + + hidden = paddle.zeros((batch_size, self.hidden_size)) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes + ) + (outputs, hidden), alpha = self.attention_cell( + hidden, inputs, char_onehots + ) + output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) + output = paddle.concat(output_hiddens, axis=1) + probs = self.generator(output) + else: + targets = paddle.zeros(shape=[batch_size], dtype="int32") + probs = None + char_onehots = None + outputs = None + alpha = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes + ) + (outputs, hidden), alpha = self.attention_cell( + hidden, inputs, char_onehots + ) + probs_step = self.generator(outputs) + if probs is None: + probs = paddle.unsqueeze(probs_step, axis=1) + else: + probs = paddle.concat( + [probs, paddle.unsqueeze(probs_step, axis=1)], axis=1 + ) + next_input = probs_step.argmax(axis=1) + targets = next_input + if not self.training: + probs = paddle.nn.functional.softmax(probs, 
axis=2) + return probs + + +class AttentionGRUCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionGRUCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size + ) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1) + + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha + + +class AttentionLSTM(nn.Layer): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionLSTM, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionLSTMCell( + in_channels, hidden_size, out_channels, use_gru=False + ) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = inputs.shape[0] + num_steps = batch_max_length + + hidden = ( + paddle.zeros((batch_size, self.hidden_size)), + paddle.zeros((batch_size, self.hidden_size)), + ) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + # one-hot vectors for a i-th char + char_onehots = self._char_to_onehot( + targets[:, i], 
onehot_dim=self.num_classes + ) + hidden, alpha = self.attention_cell(hidden, inputs, char_onehots) + + hidden = (hidden[1][0], hidden[1][1]) + output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1)) + output = paddle.concat(output_hiddens, axis=1) + probs = self.generator(output) + + else: + targets = paddle.zeros(shape=[batch_size], dtype="int32") + probs = None + char_onehots = None + alpha = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes + ) + hidden, alpha = self.attention_cell(hidden, inputs, char_onehots) + probs_step = self.generator(hidden[0]) + hidden = (hidden[1][0], hidden[1][1]) + if probs is None: + probs = paddle.unsqueeze(probs_step, axis=1) + else: + probs = paddle.concat( + [probs, paddle.unsqueeze(probs_step, axis=1)], axis=1 + ) + + next_input = probs_step.argmax(axis=1) + + targets = next_input + if not self.training: + probs = paddle.nn.functional.softmax(probs, axis=2) + return probs + + +class AttentionLSTMCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionLSTMCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + if not use_gru: + self.rnn = nn.LSTMCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size + ) + else: + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size + ) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1) + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = 
paddle.concat([context, char_onehots], 1) + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha diff --git a/docling_ibm_models/slanet_1m/modeling/heads/rec_ctc_head.py b/docling_ibm_models/slanet_1m/modeling/heads/rec_ctc_head.py new file mode 100644 index 0000000..5e19a9a --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/heads/rec_ctc_head.py @@ -0,0 +1,92 @@ +# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle +from paddle import ParamAttr, nn +from paddle.nn import functional as F + + +def get_para_bias_attr(l2_decay, k): + regularizer = paddle.regularizer.L2Decay(l2_decay) + stdv = 1.0 / math.sqrt(k * 1.0) + initializer = nn.initializer.Uniform(-stdv, stdv) + weight_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + bias_attr = ParamAttr(regularizer=regularizer, initializer=initializer) + return [weight_attr, bias_attr] + + +class CTCHead(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + fc_decay=0.0004, + mid_channels=None, + return_feats=False, + **kwargs, + ): + super(CTCHead, self).__init__() + if mid_channels is None: + weight_attr, bias_attr = get_para_bias_attr( + l2_decay=fc_decay, k=in_channels + ) + self.fc = nn.Linear( + in_channels, out_channels, weight_attr=weight_attr, bias_attr=bias_attr + ) + else: + weight_attr1, bias_attr1 = get_para_bias_attr( + l2_decay=fc_decay, k=in_channels + ) + self.fc1 = nn.Linear( + in_channels, + mid_channels, + weight_attr=weight_attr1, + bias_attr=bias_attr1, + ) + + weight_attr2, bias_attr2 = get_para_bias_attr( + l2_decay=fc_decay, k=mid_channels + ) + self.fc2 = nn.Linear( + mid_channels, + out_channels, + weight_attr=weight_attr2, + bias_attr=bias_attr2, + ) + self.out_channels = out_channels + self.mid_channels = mid_channels + self.return_feats = return_feats + + def forward(self, x, targets=None): + if self.mid_channels is None: + predicts = self.fc(x) + else: + x = self.fc1(x) + predicts = self.fc2(x) + + if self.return_feats: + result = (x, predicts) + else: + result = predicts + if not self.training: + predicts = F.softmax(predicts, axis=2) + result = predicts + + return result diff --git a/docling_ibm_models/slanet_1m/modeling/heads/rec_multi_head.py b/docling_ibm_models/slanet_1m/modeling/heads/rec_multi_head.py new file mode 100644 
index 0000000..a62ae40 --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/heads/rec_multi_head.py @@ -0,0 +1,152 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F + +from modeling.necks.rnn import ( + Im2Seq, + EncoderWithRNN, + EncoderWithFC, + SequenceEncoder, + EncoderWithSVTR, + trunc_normal_, + zeros_, +) +from .rec_ctc_head import CTCHead +from .rec_nrtr_head import Transformer + + +class FCTranspose(nn.Layer): + def __init__(self, in_channels, out_channels, only_transpose=False): + super().__init__() + self.only_transpose = only_transpose + if not self.only_transpose: + self.fc = nn.Linear(in_channels, out_channels, bias_attr=False) + + def forward(self, x): + if self.only_transpose: + return x.transpose([0, 2, 1]) + else: + return self.fc(x.transpose([0, 2, 1])) + + +class AddPos(nn.Layer): + def __init__(self, dim, w): + super().__init__() + self.dec_pos_embed = self.create_parameter( + shape=[1, w, dim], default_initializer=zeros_ + ) + self.add_parameter("dec_pos_embed", self.dec_pos_embed) + trunc_normal_(self.dec_pos_embed) + + def forward(self, x): + x = x + self.dec_pos_embed[:, : x.shape[1], :] + return x + + +class MultiHead(nn.Layer): + def 
__init__(self, in_channels, out_channels_list, **kwargs): + super().__init__() + self.head_list = kwargs.pop("head_list") + self.use_pool = kwargs.get("use_pool", False) + self.use_pos = kwargs.get("use_pos", False) + self.in_channels = in_channels + if self.use_pool: + self.pool = nn.AvgPool2D(kernel_size=[3, 2], stride=[3, 2], padding=0) + self.gtc_head = "sar" + assert len(self.head_list) >= 2 + for idx, head_name in enumerate(self.head_list): + name = list(head_name)[0] + if name == "SARHead": + # sar head + sar_args = self.head_list[idx][name] + self.sar_head = eval(name)( + in_channels=in_channels, + out_channels=out_channels_list["SARLabelDecode"], + **sar_args, + ) + elif name == "NRTRHead": + gtc_args = self.head_list[idx][name] + max_text_length = gtc_args.get("max_text_length", 25) + nrtr_dim = gtc_args.get("nrtr_dim", 256) + num_decoder_layers = gtc_args.get("num_decoder_layers", 4) + if self.use_pos: + self.before_gtc = nn.Sequential( + nn.Flatten(2), + FCTranspose(in_channels, nrtr_dim), + AddPos(nrtr_dim, 80), + ) + else: + self.before_gtc = nn.Sequential( + nn.Flatten(2), FCTranspose(in_channels, nrtr_dim) + ) + + self.gtc_head = Transformer( + d_model=nrtr_dim, + nhead=nrtr_dim // 32, + num_encoder_layers=-1, + beam_size=-1, + num_decoder_layers=num_decoder_layers, + max_len=max_text_length, + dim_feedforward=nrtr_dim * 4, + out_channels=out_channels_list["NRTRLabelDecode"], + ) + elif name == "CTCHead": + # ctc neck + self.encoder_reshape = Im2Seq(in_channels) + neck_args = self.head_list[idx][name]["Neck"] + encoder_type = neck_args.pop("name") + self.ctc_encoder = SequenceEncoder( + in_channels=in_channels, encoder_type=encoder_type, **neck_args + ) + # ctc head + head_args = self.head_list[idx][name]["Head"] + self.ctc_head = eval(name)( + in_channels=self.ctc_encoder.out_channels, + out_channels=out_channels_list["CTCLabelDecode"], + **head_args, + ) + else: + raise NotImplementedError( + "{} is not supported in MultiHead yet".format(name) + ) 
class Transformer(nn.Layer):
    """Transformer decoder with an optional encoder stack.

    The architecture is based on the paper "Attention Is All You Need".
    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones,
    Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. In Advances in
    Neural Information Processing Systems, pages 6000-6010.

    Args:
        d_model: Feature width of the encoder/decoder inputs (default=512).
        nhead: Number of attention heads (default=8).
        num_encoder_layers: Number of encoder blocks (default=6). When it is
            not positive no encoder is built and the source features are used
            directly as decoder memory.
        beam_size: Beam width used in evaluation mode (default=0). Values
            that are not positive select greedy decoding.
        num_decoder_layers: Number of decoder blocks (default=6).
        max_len: Exclusive upper bound of the evaluation decoding loop and
            padded length of the beam-search output (default=25).
        dim_feedforward: Hidden width of the feed-forward sub-layer
            (default=1024).
        attention_dropout_rate: Dropout applied to attention weights
            (default=0.0).
        residual_dropout_rate: Dropout applied on the residual branches and
            in the positional encoding (default=0.1).
        in_channels: Accepted for interface compatibility; not used.
        out_channels: Number of output classes; one extra class is added
            internally (default=0).
        scale_embedding: Forwarded to ``Embeddings`` (default=True).
    """

    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        beam_size=0,
        num_decoder_layers=6,
        max_len=25,
        dim_feedforward=1024,
        attention_dropout_rate=0.0,
        residual_dropout_rate=0.1,
        in_channels=0,
        out_channels=0,
        scale_embedding=True,
    ):
        super(Transformer, self).__init__()
        self.out_channels = out_channels + 1
        self.max_len = max_len
        self.embedding = Embeddings(
            d_model=d_model,
            vocab=self.out_channels,
            padding_idx=0,
            scale_embedding=scale_embedding,
        )
        self.positional_encoding = PositionalEncoding(
            dropout=residual_dropout_rate, dim=d_model
        )

        if num_encoder_layers > 0:
            self.encoder = nn.LayerList(
                [
                    TransformerBlock(
                        d_model,
                        nhead,
                        dim_feedforward,
                        attention_dropout_rate,
                        residual_dropout_rate,
                        with_self_attn=True,
                        with_cross_attn=False,
                    )
                    for _ in range(num_encoder_layers)
                ]
            )
        else:
            self.encoder = None

        self.decoder = nn.LayerList(
            [
                TransformerBlock(
                    d_model,
                    nhead,
                    dim_feedforward,
                    attention_dropout_rate,
                    residual_dropout_rate,
                    with_self_attn=True,
                    with_cross_attn=True,
                )
                for _ in range(num_decoder_layers)
            ]
        )

        self.beam_size = beam_size
        self.d_model = d_model
        self.nhead = nhead
        self.tgt_word_prj = nn.Linear(d_model, self.out_channels, bias_attr=False)
        w0 = np.random.normal(
            0.0, d_model**-0.5, (d_model, self.out_channels)
        ).astype(np.float32)
        self.tgt_word_prj.weight.set_value(w0)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialisation hook passed to ``self.apply`` for every sub-layer."""
        if isinstance(m, nn.Linear):
            # NOTE(review): ``xavier_normal_`` is the ``XavierNormal`` class
            # (see the import alias); calling it with the weight appears to
            # only construct an initializer object -- confirm the intent.
            xavier_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)

    def _encode(self, src):
        """Return the decoder memory: ``src`` run through the encoder stack, if any."""
        if self.encoder is None:
            return src  # B N C
        src = self.positional_encoding(src)
        # ``self.encoder`` is a LayerList, so its blocks are applied one by one.
        for encoder_layer in self.encoder:
            src = encoder_layer(src)
        return src  # B N C

    def _run_decoder(self, tgt, memory, tgt_mask):
        """Apply every decoder block to ``tgt`` using ``memory`` and the causal mask."""
        for decoder_layer in self.decoder:
            tgt = decoder_layer(tgt, memory, self_mask=tgt_mask)
        return tgt

    def forward_train(self, src, tgt):
        """Teacher-forced pass: return the logits for ``tgt`` without its last token."""
        tgt = tgt[:, :-1]

        tgt = self.embedding(tgt)
        tgt = self.positional_encoding(tgt)
        tgt_mask = self.generate_square_subsequent_mask(tgt.shape[1])

        memory = self._encode(src)
        output = self._run_decoder(tgt, memory, tgt_mask)
        return self.tgt_word_prj(output)

    def forward(self, src, targets=None):
        """Dispatch to training, beam-search or greedy decoding.

        Args:
            src: Source features fed to the encoder (or used directly as
                decoder memory when there is no encoder), laid out as
                ``(B, N, C)``.
            targets: Required in training mode. ``targets[0]`` holds the
                target token ids and ``targets[1]`` their lengths; the ids
                are cut to ``2 + targets[1].max()`` columns.

        Returns:
            Training mode: the logits from ``forward_train``.
            Evaluation mode: the ``[ids, probabilities]`` list produced by
            ``forward_beam`` (``beam_size > 0``) or ``forward_test``.
        """
        if self.training:
            max_len = targets[1].max()
            tgt = targets[0][:, : 2 + max_len]
            return self.forward_train(src, tgt)
        if self.beam_size > 0:
            return self.forward_beam(src)
        return self.forward_test(src)

    def forward_test(self, src):
        """Greedy decoding.

        Starts every sequence with token id 2 and stops early once every
        sample in the batch predicts token id 3 at the same step.

        Returns:
            ``[dec_seq, dec_prob]``: the decoded ids (including the start
            token) and the probability of each chosen id.
        """
        bs = src.shape[0]
        memory = self._encode(src)
        dec_seq = paddle.full((bs, 1), 2, dtype=paddle.int64)
        dec_prob = paddle.full((bs, 1), 1.0, dtype=paddle.float32)
        for len_dec_seq in range(1, paddle.to_tensor(self.max_len)):
            dec_seq_embed = self.embedding(dec_seq)
            dec_seq_embed = self.positional_encoding(dec_seq_embed)
            tgt_mask = self.generate_square_subsequent_mask(dec_seq_embed.shape[1])
            dec_output = self._run_decoder(dec_seq_embed, memory, tgt_mask)
            dec_output = dec_output[:, -1, :]
            word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=-1)
            preds_idx = paddle.argmax(word_prob, axis=-1)
            if paddle.equal_all(
                preds_idx, paddle.full(preds_idx.shape, 3, dtype="int64")
            ):
                break
            preds_prob = paddle.max(word_prob, axis=-1)
            dec_seq = paddle.concat(
                [dec_seq, paddle.reshape(preds_idx, [-1, 1])], axis=1
            )
            dec_prob = paddle.concat(
                [dec_prob, paddle.reshape(preds_prob, [-1, 1])], axis=1
            )
        return [dec_seq, dec_prob]

    def forward_beam(self, images):
        """Beam-search decoding of a single instance.

        Exactly one ``Beam`` is created, so only one instance is decoded per
        call.

        Returns:
            ``[ids, scores]``: the best hypothesis padded with token id 3 to
            ``self.max_len`` entries, and its length-normalised score
            repeated ``self.max_len`` times.
        """

        def get_inst_idx_to_tensor_position_map(inst_idx_list):
            """Indicate the position of an instance in a tensor."""
            return {
                inst_idx: tensor_position
                for tensor_position, inst_idx in enumerate(inst_idx_list)
            }

        def collect_active_part(
            beamed_tensor, curr_active_inst_idx, n_prev_active_inst, n_bm
        ):
            """Collect tensor parts associated to active instances."""
            beamed_tensor_shape = beamed_tensor.shape
            n_curr_active_inst = len(curr_active_inst_idx)
            new_shape = (
                n_curr_active_inst * n_bm,
                beamed_tensor_shape[1],
                beamed_tensor_shape[2],
            )

            beamed_tensor = beamed_tensor.reshape([n_prev_active_inst, -1])
            beamed_tensor = beamed_tensor.index_select(curr_active_inst_idx, axis=0)
            beamed_tensor = beamed_tensor.reshape(new_shape)

            return beamed_tensor

        def collate_active_info(
            src_enc, inst_idx_to_position_map, active_inst_idx_list
        ):
            # Sentences which are still active are collected,
            # so the decoder will not run on completed sentences.
            n_prev_active_inst = len(inst_idx_to_position_map)
            active_inst_idx = [
                inst_idx_to_position_map[k] for k in active_inst_idx_list
            ]
            active_inst_idx = paddle.to_tensor(active_inst_idx, dtype="int64")
            active_src_enc = collect_active_part(
                src_enc.transpose([1, 0, 2]), active_inst_idx, n_prev_active_inst, n_bm
            ).transpose([1, 0, 2])
            active_inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(
                active_inst_idx_list
            )
            return active_src_enc, active_inst_idx_to_position_map

        def beam_decode_step(
            inst_dec_beams, len_dec_seq, enc_output, inst_idx_to_position_map, n_bm
        ):
            """Decode and update beam status, and then return active beam idx"""

            def prepare_beam_dec_seq(inst_dec_beams, len_dec_seq):
                dec_partial_seq = [
                    b.get_current_state() for b in inst_dec_beams if not b.done
                ]
                dec_partial_seq = paddle.stack(dec_partial_seq)
                dec_partial_seq = dec_partial_seq.reshape([-1, len_dec_seq])
                return dec_partial_seq

            def predict_word(dec_seq, enc_output, n_active_inst, n_bm):
                dec_seq = self.embedding(dec_seq)
                dec_seq = self.positional_encoding(dec_seq)
                tgt_mask = self.generate_square_subsequent_mask(dec_seq.shape[1])
                dec_output = self._run_decoder(dec_seq, enc_output, tgt_mask)
                dec_output = dec_output[:, -1, :]  # Pick the last step: (bh * bm) * d_h
                word_prob = F.softmax(self.tgt_word_prj(dec_output), axis=1)
                word_prob = paddle.reshape(word_prob, [n_active_inst, n_bm, -1])
                return word_prob

            def collect_active_inst_idx_list(
                inst_beams, word_prob, inst_idx_to_position_map
            ):
                active_inst_idx_list = []
                for inst_idx, inst_position in inst_idx_to_position_map.items():
                    is_inst_complete = inst_beams[inst_idx].advance(
                        word_prob[inst_position]
                    )
                    if not is_inst_complete:
                        active_inst_idx_list += [inst_idx]

                return active_inst_idx_list

            n_active_inst = len(inst_idx_to_position_map)
            dec_seq = prepare_beam_dec_seq(inst_dec_beams, len_dec_seq)
            word_prob = predict_word(dec_seq, enc_output, n_active_inst, n_bm)
            # Update the beam with predicted word prob information and collect
            # incomplete instances
            active_inst_idx_list = collect_active_inst_idx_list(
                inst_dec_beams, word_prob, inst_idx_to_position_map
            )
            return active_inst_idx_list

        def collect_hypothesis_and_scores(inst_dec_beams, n_best):
            all_hyp, all_scores = [], []
            for beam in inst_dec_beams:
                scores, tail_idxs = beam.sort_scores()
                all_scores += [scores[:n_best]]
                hyps = [beam.get_hypothesis(i) for i in tail_idxs[:n_best]]
                all_hyp += [hyps]
            return all_hyp, all_scores

        with paddle.no_grad():
            # -- Encode (the LayerList is iterated; it cannot be called directly)
            src_enc = self._encode(images)

            n_bm = self.beam_size
            inst_dec_beams = [Beam(n_bm)]
            active_inst_idx_list = [0]
            # Repeat data for beam search
            src_enc = paddle.tile(src_enc, [1, n_bm, 1])
            inst_idx_to_position_map = get_inst_idx_to_tensor_position_map(
                active_inst_idx_list
            )
            # Decode
            for len_dec_seq in range(1, paddle.to_tensor(self.max_len)):
                src_enc_copy = src_enc.clone()
                active_inst_idx_list = beam_decode_step(
                    inst_dec_beams,
                    len_dec_seq,
                    src_enc_copy,
                    inst_idx_to_position_map,
                    n_bm,
                )
                if not active_inst_idx_list:
                    break  # all instances have finished their path
                src_enc, inst_idx_to_position_map = collate_active_info(
                    src_enc_copy, inst_idx_to_position_map, active_inst_idx_list
                )
        batch_hyp, batch_scores = collect_hypothesis_and_scores(inst_dec_beams, 1)
        result_hyp = []
        hyp_scores = []
        for bs_hyp, score in zip(batch_hyp, batch_scores):
            hyp_len = len(bs_hyp[0])
            # Pad to the configured maximum length (previously hard-coded to 25,
            # which is also the default of ``max_len``).
            result_hyp.append(bs_hyp[0] + [3] * (self.max_len - hyp_len))
            score = float(score) / hyp_len
            hyp_scores.append([score] * self.max_len)
        return [
            paddle.to_tensor(np.array(result_hyp), dtype=paddle.int64),
            paddle.to_tensor(hyp_scores),
        ]

    def generate_square_subsequent_mask(self, sz):
        """Generate a square mask for the sequence.

        Positions above the main diagonal are filled with float('-inf');
        all other positions are filled with float(0.0).

        Returns:
            The mask with two leading singleton axes: ``[1, 1, sz, sz]``.
        """
        mask = paddle.zeros([sz, sz], dtype="float32")
        mask_inf = paddle.triu(
            paddle.full(shape=[sz, sz], dtype="float32", fill_value="-inf"), diagonal=1
        )
        mask = mask + mask_inf
        return mask.unsqueeze([0, 1])
class TransformerBlock(nn.Layer):
    """One transformer layer: optional self-attention, optional cross-attention, then an MLP.

    Each enabled sub-layer is followed by dropout, a residual addition and a
    layer normalisation.

    Args:
        d_model: Feature width.
        nhead: Number of attention heads.
        dim_feedforward: Hidden width of the MLP.
        attention_dropout_rate: Dropout rate inside the attention layers.
        residual_dropout_rate: Dropout rate on the residual branches and in
            the MLP.
        with_self_attn: Build and apply the self-attention sub-layer.
        with_cross_attn: Build and apply the cross-attention sub-layer.
        epsilon: Epsilon of the layer normalisations.
    """

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward=2048,
        attention_dropout_rate=0.0,
        residual_dropout_rate=0.1,
        with_self_attn=True,
        with_cross_attn=False,
        epsilon=1e-5,
    ):
        super().__init__()
        self.with_self_attn = with_self_attn
        self.with_cross_attn = with_cross_attn

        # Sub-layers are created in a fixed order: self-attention group,
        # cross-attention group, then the feed-forward group.
        if with_self_attn:
            self.self_attn = MultiheadAttention(
                d_model, nhead, dropout=attention_dropout_rate, self_attn=with_self_attn
            )
            self.norm1 = LayerNorm(d_model, epsilon=epsilon)
            self.dropout1 = Dropout(residual_dropout_rate)

        if with_cross_attn:
            # Built without ``self_attn``, so queries and keys/values use
            # separate projections.
            self.cross_attn = MultiheadAttention(
                d_model, nhead, dropout=attention_dropout_rate
            )
            self.norm2 = LayerNorm(d_model, epsilon=epsilon)
            self.dropout2 = Dropout(residual_dropout_rate)

        self.mlp = Mlp(
            in_features=d_model,
            hidden_features=dim_feedforward,
            act_layer=nn.ReLU,
            drop=residual_dropout_rate,
        )
        self.norm3 = LayerNorm(d_model, epsilon=epsilon)
        self.dropout3 = Dropout(residual_dropout_rate)

    def forward(self, tgt, memory=None, self_mask=None, cross_mask=None):
        """Apply the enabled sub-layers to ``tgt``.

        Args:
            tgt: Input sequence.
            memory: Keys/values for the cross-attention sub-layer.
            self_mask: Additive mask for self-attention.
            cross_mask: Additive mask for cross-attention.

        Returns:
            The transformed sequence.
        """
        out = tgt
        if self.with_self_attn:
            attended = self.self_attn(out, attn_mask=self_mask)
            out = self.norm1(out + self.dropout1(attended))

        if self.with_cross_attn:
            crossed = self.cross_attn(out, key=memory, attn_mask=cross_mask)
            out = self.norm2(out + self.dropout2(crossed))

        return self.norm3(out + self.dropout3(self.mlp(out)))
class PositionalEncoding_2d(nn.Layer):
    r"""Add sine/cosine positional encodings along the height and width of a 4-D input.

    The same sinusoidal table is used for both axes. Each axis' encoding is
    scaled by a per-sample, per-channel gate computed from the globally
    average-pooled input through a linear layer.

    .. math::
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^{2i/d_{model}})
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^{2i/d_{model}})

    where ``pos`` is the position and ``i`` is the embedding index.

    Args:
        dropout: Dropout rate applied to the output.
        dim: Embedding (channel) dimension.
        max_len: Number of positions stored in the table (default=5000).

    Examples:
        >>> pos_encoder = PositionalEncoding_2d(dropout, dim)
    """

    def __init__(self, dropout, dim, max_len=5000):
        super(PositionalEncoding_2d, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = paddle.zeros([max_len, dim])
        position = paddle.arange(0, max_len, dtype=paddle.float32).unsqueeze(1)
        div_term = paddle.exp(
            paddle.arange(0, dim, 2).astype("float32") * (-math.log(10000.0) / dim)
        )
        pe[:, 0::2] = paddle.sin(position * div_term)
        pe[:, 1::2] = paddle.cos(position * div_term)
        # Stored as [max_len, 1, dim].
        pe = paddle.transpose(paddle.unsqueeze(pe, 0), [1, 0, 2])
        self.register_buffer("pe", pe)

        self.avg_pool_1 = nn.AdaptiveAvgPool2D((1, 1))
        self.linear1 = nn.Linear(dim, dim)
        # Fill the gate weights with ones via ``set_value``, the same Paddle
        # API the rest of this file uses to overwrite parameters.
        self.linear1.weight.set_value(paddle.ones_like(self.linear1.weight))
        self.avg_pool_2 = nn.AdaptiveAvgPool2D((1, 1))
        self.linear2 = nn.Linear(dim, dim)
        self.linear2.weight.set_value(paddle.ones_like(self.linear2.weight))

    def forward(self, x):
        """Add the gated width and height encodings to ``x`` and flatten it.

        Args:
            x: 4-D input ``[batch, dim, height, width]``.

        Returns:
            Tensor of shape ``[height * width, batch, dim]`` after dropout.

        Examples:
            >>> output = pos_encoder(x)
        """
        # Width encoding: [W, 1, dim] gated to [W, B, dim] -> [B, dim, 1, W].
        w_pe = self.pe[: x.shape[-1], :]
        w1 = self.linear1(self.avg_pool_1(x).squeeze()).unsqueeze(0)
        w_pe = w_pe * w1
        w_pe = paddle.transpose(w_pe, [1, 2, 0])
        w_pe = paddle.unsqueeze(w_pe, 2)

        # Height encoding: [H, 1, dim] gated to [H, B, dim] -> [B, dim, H, 1].
        h_pe = self.pe[: x.shape[-2], :]
        w2 = self.linear2(self.avg_pool_2(x).squeeze()).unsqueeze(0)
        h_pe = h_pe * w2
        h_pe = paddle.transpose(h_pe, [1, 2, 0])
        h_pe = paddle.unsqueeze(h_pe, 3)

        x = x + w_pe + h_pe
        x = paddle.transpose(
            paddle.reshape(x, [x.shape[0], x.shape[1], x.shape[2] * x.shape[3]]),
            [2, 0, 1],
        )

        return self.dropout(x)
class Beam:
    """Beam-search state for a single instance.

    Args:
        size: Beam width.
        device: Accepted for interface compatibility; not used.
    """

    def __init__(self, size, device=False):
        self.size = size
        self._done = False
        # The score for each translation on the beam.
        self.scores = paddle.zeros((size,), dtype=paddle.float32)
        self.all_scores = []
        # The backpointers at each time-step.
        self.prev_ks = []
        # The outputs at each time-step; slot 0 of the first step holds
        # token id 2, which every hypothesis is prefixed with.
        self.next_ys = [paddle.full((size,), 0, dtype=paddle.int64)]
        self.next_ys[0][0] = 2

    def get_current_state(self):
        "Get the outputs for the current timestep."
        return self.get_tentative_hypothesis()

    def get_current_origin(self):
        "Get the backpointers for the current timestep."
        return self.prev_ks[-1]

    @property
    def done(self):
        """Whether the top-of-beam hypothesis has produced the end token."""
        return self._done

    def advance(self, word_prob):
        """Update beam status and check if finished or not.

        Args:
            word_prob: Per-beam word scores, ``[beam, num_words]``.

        Returns:
            ``True`` once the best beam entry emits token id 3 (EOS).
        """
        num_words = word_prob.shape[1]

        # Sum the previous scores.
        if len(self.prev_ks) > 0:
            beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob)
        else:
            # First step: only the first beam entry carries the start token.
            beam_lk = word_prob[0]

        flat_beam_lk = beam_lk.reshape([-1])
        best_scores, best_scores_id = flat_beam_lk.topk(
            self.size, 0, True, True
        )  # 1st sort
        self.all_scores.append(self.scores)
        self.scores = best_scores
        # best_scores_id is flattened as a (beam x word) array,
        # so we need to calculate which word and beam each score came from
        prev_k = best_scores_id // num_words
        self.prev_ks.append(prev_k)
        self.next_ys.append(best_scores_id - prev_k * num_words)
        # End condition is when top-of-beam is EOS.
        if self.next_ys[-1][0] == 3:
            self._done = True
            self.all_scores.append(self.scores)

        return self._done

    def sort_scores(self):
        """Return the current scores and their beam indices.

        The scores come from ``topk(..., sorted=True)`` in ``advance`` and
        are returned as stored, paired with the indices ``0..size-1``.
        """
        return self.scores, paddle.to_tensor(
            list(range(int(self.scores.shape[0]))), dtype="int32"
        )

    def get_the_best_score_and_idx(self):
        "Get the score of the best in the beam."
        scores, ids = self.sort_scores()
        # Index 0 is the top entry; index 1 would be the runner-up and does
        # not exist for a beam of size 1.
        return scores[0], ids[0]

    def get_tentative_hypothesis(self):
        "Get the decoded sequence for the current timestep."
        if len(self.next_ys) == 1:
            return self.next_ys[0].unsqueeze(1)
        _, keys = self.sort_scores()
        hyps = [[2] + self.get_hypothesis(k) for k in keys]
        return paddle.to_tensor(hyps, dtype="int64")

    def get_hypothesis(self, k):
        """Walk back to construct the full hypothesis.

        Args:
            k: Beam index at the last time-step.

        Returns:
            The token ids of the hypothesis, oldest first, as Python scalars.
        """
        hyp = []
        for j in range(len(self.prev_ks) - 1, -1, -1):
            hyp.append(self.next_ys[j + 1][k])
            k = self.prev_ks[j][k]
        return [token.item() for token in reversed(hyp)]
def get_para_bias_attr(l2_decay, k):
    """Build the weight and bias ``ParamAttr`` pair for a linear layer.

    When ``l2_decay`` is positive, both attributes share an L2 regularizer
    and a uniform initializer bounded by ``-1/sqrt(k)`` and ``+1/sqrt(k)``.
    Otherwise both are created with ``regularizer=None`` and
    ``initializer=None``.

    Args:
        l2_decay: L2 decay coefficient; non-positive values disable the
            regularizer and the custom initializer.
        k: Value that scales the initializer bounds.

    Returns:
        A two-element list ``[weight_attr, bias_attr]``.
    """
    regularizer = None
    initializer = None
    if l2_decay > 0:
        regularizer = paddle.regularizer.L2Decay(l2_decay)
        bound = 1.0 / math.sqrt(k * 1.0)
        initializer = nn.initializer.Uniform(-bound, bound)
    return [
        ParamAttr(regularizer=regularizer, initializer=initializer),
        ParamAttr(regularizer=regularizer, initializer=initializer),
    ]
you modify the var in just one branch, then the modification will not work. + fea = inputs[-1] + last_shape = int(np.prod(fea.shape[2:])) # gry added + fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], last_shape]) + fea = fea.transpose([0, 2, 1]) # (NTC)(batch, width, channels) + batch_size = fea.shape[0] + + hidden = paddle.zeros((batch_size, self.hidden_size)) + output_hiddens = paddle.zeros( + (batch_size, self.max_text_length + 1, self.hidden_size) + ) + if self.training and targets is not None: + structure = targets[0] + for i in range(self.max_text_length + 1): + elem_onehots = self._char_to_onehot( + structure[:, i], onehot_dim=self.out_channels + ) + (outputs, hidden), alpha = self.structure_attention_cell( + hidden, fea, elem_onehots + ) + output_hiddens[:, i, :] = outputs + structure_probs = self.structure_generator(output_hiddens) + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output_hiddens, loc_fea], axis=2) + loc_preds = self.loc_generator(loc_concat) + loc_preds = F.sigmoid(loc_preds) + else: + temp_elem = paddle.zeros(shape=[batch_size], dtype="int32") + structure_probs = None + loc_preds = None + elem_onehots = None + outputs = None + alpha = None + max_text_length = paddle.to_tensor(self.max_text_length) + for i in range(max_text_length + 1): + elem_onehots = self._char_to_onehot( + temp_elem, onehot_dim=self.out_channels + ) + (outputs, hidden), alpha = self.structure_attention_cell( + hidden, fea, elem_onehots + ) + output_hiddens[:, i, :] = outputs + structure_probs_step = self.structure_generator(outputs) + temp_elem = structure_probs_step.argmax(axis=1, dtype="int32") + + structure_probs = self.structure_generator(output_hiddens) + structure_probs = F.softmax(structure_probs) + loc_fea = fea.transpose([0, 2, 1]) + loc_fea = self.loc_fea_trans(loc_fea) + loc_fea = loc_fea.transpose([0, 2, 1]) + loc_concat = paddle.concat([output_hiddens, 
class HWAttention(nn.Layer):
    """Scaled dot-product attention over a pre-computed, fused q/k/v tensor.

    Args:
        head_dim: Width of one attention head.
        qk_scale: Scale applied to the q-k product; falls back to
            ``head_dim ** -0.5`` when falsy.
        attn_drop: Dropout rate on the attention weights.
    """

    def __init__(
        self,
        head_dim=32,
        qk_scale=None,
        attn_drop=0.0,
    ):
        super().__init__()
        self.head_dim = head_dim
        self.scale = qk_scale or self.head_dim**-0.5
        self.attn_drop = nn.Dropout(attn_drop)

    def forward(self, x):
        """Attend within each sequence of ``x``.

        Args:
            x: ``[B, N, 3 * C]`` tensor holding q, k and v side by side on
                the last axis; ``C`` must be a multiple of ``head_dim``.

        Returns:
            ``[B, N, C]`` attention output.
        """
        B, N, C = x.shape
        C = C // 3
        # -> [3, B, heads, N, head_dim]
        qkv = x.reshape([B, N, 3, C // self.head_dim, self.head_dim]).transpose(
            [2, 0, 3, 1, 4]
        )
        q, k, v = qkv.unbind(0)
        attn = q @ k.transpose([0, 1, 3, 2]) * self.scale
        attn = F.softmax(attn, -1)
        attn = self.attn_drop(attn)
        x = attn @ v  # [B, heads, N, head_dim]
        # The product is 4-D, so the permutation needs four entries: move the
        # head axis next to head_dim before merging them back into C.
        x = x.transpose([0, 2, 1, 3]).reshape([B, N, C])
        return x


def img2windows(img, H_sp, W_sp):
    """Split a channels-last feature map into non-overlapping windows.

    Args:
        img: ``[B, H, W, C]`` tensor.
        H_sp: Window height; must divide ``H``.
        W_sp: Window width; must divide ``W``.

    Returns:
        ``[B * (H // H_sp) * (W // W_sp), H_sp * W_sp, C]`` tensor.
    """
    B, H, W, C = img.shape
    img_reshape = img.reshape([B, H // H_sp, H_sp, W // W_sp, W_sp, C])
    img_perm = img_reshape.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H_sp * W_sp, C])
    return img_perm


def windows2img(img_splits_hw, H_sp, W_sp, H, W):
    """Merge windows produced by ``img2windows`` back into one sequence per image.

    Args:
        img_splits_hw: Windowed tensor whose first axis enumerates
            ``B * (H // H_sp) * (W // W_sp)`` windows.
        H_sp: Window height.
        W_sp: Window width.
        H: Height of the full feature map.
        W: Width of the full feature map.

    Returns:
        ``[B, H * W, C]`` tensor.
    """
    B = int(img_splits_hw.shape[0] / (H * W / H_sp / W_sp))

    img = img_splits_hw.reshape([B, H // H_sp, W // W_sp, H_sp, W_sp, -1])
    img = img.transpose([0, 1, 3, 2, 4, 5]).flatten(1, 4)
    return img
class SLAHead(nn.Layer):
    """Step-wise decoder head that predicts a structure token and a location
    vector at every step.

    Each step runs an ``AttentionGRUCell`` over the flattened feature map and
    feeds its output to two small fully-connected stacks: one producing
    ``out_channels`` structure scores and one producing ``loc_reg_num``
    sigmoid-activated location values.
    """

    def __init__(
        self,
        in_channels,
        hidden_size,
        out_channels=30,
        max_text_length=500,
        loc_reg_num=4,
        fc_decay=0.0,
        use_attn=False,
        **kwargs,
    ):
        """
        @param in_channels: sequence of channel counts; only the last entry is used
        @param hidden_size: hidden_size for RNN and Embedding
        @param out_channels: num_classes to rec (also the one-hot width)
        @param max_text_length: max text pred; buffers hold max_text_length + 1 steps
        @param loc_reg_num: number of location values predicted per step
        @param fc_decay: l2_decay forwarded to get_para_bias_attr for the FC layers
        @param use_attn: when True, two Block layers refine the feature map first
        @param kwargs: accepted and ignored
        """
        super().__init__()
        in_channels = in_channels[-1]
        self.hidden_size = hidden_size
        self.max_text_length = max_text_length
        self.emb = self._char_to_onehot
        self.num_embeddings = out_channels
        self.loc_reg_num = loc_reg_num
        # The last class index is the end-of-sequence marker tested in forward().
        self.eos = self.num_embeddings - 1

        # structure
        self.structure_attention_cell = AttentionGRUCell(
            in_channels, hidden_size, self.num_embeddings
        )
        weight_attr, bias_attr = get_para_bias_attr(l2_decay=fc_decay, k=hidden_size)
        # NOTE(review): weight_attr1_1 / bias_attr1_1 are not used anywhere below.
        weight_attr1_1, bias_attr1_1 = get_para_bias_attr(
            l2_decay=fc_decay, k=hidden_size
        )
        weight_attr1_2, bias_attr1_2 = get_para_bias_attr(
            l2_decay=fc_decay, k=hidden_size
        )
        self.structure_generator = nn.Sequential(
            nn.Linear(
                self.hidden_size,
                self.hidden_size,
                weight_attr=weight_attr1_2,
                bias_attr=bias_attr1_2,
            ),
            nn.Linear(
                hidden_size, out_channels, weight_attr=weight_attr, bias_attr=bias_attr
            ),
        )
        # Drop-path rates for the two optional attention blocks: 0.0 and 0.1.
        dpr = np.linspace(0, 0.1, 2)

        self.use_attn = use_attn
        if use_attn:
            layer_list = [
                Block(
                    in_channels,
                    num_heads=2,
                    mlp_ratio=4.0,
                    qkv_bias=True,
                    drop_path=dpr[i],
                )
                for i in range(2)
            ]
            self.cross_atten = nn.Sequential(*layer_list)
        # loc
        weight_attr1, bias_attr1 = get_para_bias_attr(
            l2_decay=fc_decay, k=self.hidden_size
        )
        weight_attr2, bias_attr2 = get_para_bias_attr(
            l2_decay=fc_decay, k=self.hidden_size
        )
        self.loc_generator = nn.Sequential(
            nn.Linear(
                self.hidden_size,
                self.hidden_size,
                weight_attr=weight_attr1,
                bias_attr=bias_attr1,
            ),
            nn.Linear(
                self.hidden_size,
                loc_reg_num,
                weight_attr=weight_attr2,
                bias_attr=bias_attr2,
            ),
            nn.Sigmoid(),
        )

    def forward(self, inputs, targets=None):
        """Decode structure scores and location values step by step.

        @param inputs: sequence of feature maps; only inputs[-1] is used
        @param targets: training targets; targets[0] supplies the token fed at
            each step and targets[-2].max() the number of steps
            (presumably ground-truth tokens and lengths -- confirm with the data pipeline)
        @return: dict with "structure_probs" and "loc_preds", both trimmed to
            the number of decoded steps
        """
        fea = inputs[-1]
        batch_size = fea.shape[0]
        if self.use_attn:
            # Residual refinement of the feature map by the attention blocks.
            fea = fea + self.cross_atten(fea)
        # reshape
        fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], -1])
        fea = fea.transpose([0, 2, 1])  # (NTC)(batch, width, channels)

        hidden = paddle.zeros((batch_size, self.hidden_size))
        # Output buffers sized for the longest possible sequence; filled one
        # step at a time and sliced to the decoded length afterwards.
        structure_preds = paddle.zeros(
            (batch_size, self.max_text_length + 1, self.num_embeddings)
        )
        loc_preds = paddle.zeros(
            (batch_size, self.max_text_length + 1, self.loc_reg_num)
        )
        structure_preds.stop_gradient = True
        loc_preds.stop_gradient = True

        if self.training and targets is not None:
            # Training with targets: the token fed at step i comes from the
            # targets rather than from the previous prediction.
            structure = targets[0]
            max_len = targets[-2].max()
            for i in range(max_len + 1):
                hidden, structure_step, loc_step = self._decode(
                    structure[:, i], fea, hidden
                )
                structure_preds[:, i, :] = structure_step
                loc_preds[:, i, :] = loc_step
            structure_preds = structure_preds[:, : max_len + 1]
            loc_preds = loc_preds[:, : max_len + 1]
        else:
            # Free-running decode: each step consumes the argmax of the
            # previous step, starting from token 0.
            structure_ids = paddle.zeros(
                (batch_size, self.max_text_length + 1), dtype=paddle.int64
            )
            pre_chars = paddle.zeros(shape=[batch_size], dtype="int32")
            max_text_length = paddle.to_tensor(self.max_text_length)
            # for export
            loc_step, structure_step = None, None
            for i in range(max_text_length + 1):
                hidden, structure_step, loc_step = self._decode(pre_chars, fea, hidden)
                pre_chars = structure_step.argmax(axis=1, dtype="int32")
                structure_preds[:, i, :] = structure_step
                loc_preds[:, i, :] = loc_step

                structure_ids[:, i] = pre_chars
                # Stop once every sample in the batch has produced eos at least once.
                if (structure_ids == self.eos).any(-1).all():
                    break
        if not self.training:
            # `i` is the last executed step of the loop above; outside
            # training the scores are converted with softmax and both
            # outputs are cut to the decoded length.
            structure_preds = F.softmax(structure_preds[:, : i + 1])
            loc_preds = loc_preds[:, : i + 1]
        return {"structure_probs": structure_preds, "loc_preds": loc_preds}

    def _decode(self, pre_chars, features, hidden):
        """
        Predict table label and coordinates for each step
        @param pre_chars: Table label in previous step
        @param features: flattened feature sequence attended over by the cell
        @param hidden: hidden status in previous step
        @return: (new hidden state, structure scores, location values)
        """
        emb_feature = self.emb(pre_chars)
        # output shape is b * self.hidden_size
        # `alpha` (second value returned by the cell) is not used here.
        (output, hidden), alpha = self.structure_attention_cell(
            hidden, features, emb_feature
        )

        # structure
        structure_step = self.structure_generator(output)
        # loc
        loc_step = self.loc_generator(output)
        return hidden, structure_step, loc_step

    def _char_to_onehot(self, input_char):
        """One-hot encode token ids over ``num_embeddings`` classes."""
        input_ont_hot = F.one_hot(input_char, self.num_embeddings)
        return input_ont_hot
def build_neck(config):
    """Instantiate the neck module described by ``config``.

    Args:
        config (dict): Must contain ``"name"``, the class name of a supported
            neck. Every remaining item is forwarded to the class constructor
            as a keyword argument. ``"name"`` is popped, so the caller's dict
            is modified.

    Returns:
        The constructed neck layer.

    Raises:
        KeyError: If ``config`` has no ``"name"`` entry.
        AssertionError: If the requested name is not a supported neck.
    """
    support_dict = [
        "CSPPAN",
    ]

    module_name = config.pop("name")
    # Raised explicitly instead of via ``assert`` so the check is not stripped
    # under ``python -O``; exception type and message are unchanged.
    if module_name not in support_dict:
        raise AssertionError("neck only support {}".format(support_dict))

    # Imported only after validation, so an unsupported name is reported
    # without loading the neck implementation.
    from .csp_pan import CSPPAN

    # Explicit name -> class table: the config string is only ever used as a
    # dictionary key, never evaluated as code.
    neck_classes = {"CSPPAN": CSPPAN}
    return neck_classes[module_name](**config)
class ConvBNLayer(nn.Layer):
    """Bias-free ``Conv2D`` followed by ``BatchNorm2D`` and a named activation.

    Args:
        in_channel (int): Input channels of the convolution.
        out_channel (int): Output channels of the convolution and batch norm.
        kernel_size (int): Convolution kernel size; padding is
            ``(kernel_size - 1) // 2``.
        stride (int): Convolution stride.
        groups (int): Convolution groups.
        act (str): ``"leaky_relu"`` or ``"hard_swish"``; anything else fails
            the constructor's assertion.
    """

    def __init__(
        self,
        in_channel=96,
        out_channel=96,
        kernel_size=3,
        stride=1,
        groups=1,
        act="leaky_relu",
    ):
        super().__init__()
        kaiming = nn.initializer.KaimingUniform()
        self.act = act
        assert self.act in ["leaky_relu", "hard_swish"]
        half_pad = (kernel_size - 1) // 2
        self.conv = nn.Conv2D(
            in_channels=in_channel,
            out_channels=out_channel,
            kernel_size=kernel_size,
            groups=groups,
            padding=half_pad,
            stride=stride,
            weight_attr=ParamAttr(initializer=kaiming),
            bias_attr=False,
        )
        self.bn = nn.BatchNorm2D(out_channel)

    def forward(self, x):
        """Apply conv -> batch norm -> the configured activation."""
        normed = self.bn(self.conv(x))
        if self.act == "leaky_relu":
            return F.leaky_relu(normed)
        if self.act == "hard_swish":
            return F.hardswish(normed)
        return normed
class DarknetBottleneck(nn.Layer):
    """Two-stage bottleneck: a 1x1 ``ConvBNLayer`` followed by a
    ``kernel_size`` conv module, with an optional residual connection.

    Args:
        in_channels (int): The input channels of this Module.
        out_channels (int): The output channels of this Module.
        kernel_size (int): Kernel size of the second conv module. Default: 3
        expansion (float): The hidden width between the two convs is
            ``int(out_channels * expansion)``. Default: 0.5
        add_identity (bool): Whether to add the input to the output. Only
            takes effect when ``in_channels == out_channels``. Default: True
        use_depthwise (bool): Use ``DPModule`` instead of ``ConvBNLayer`` for
            the second conv module. Default: False
        act (str): Activation name passed to both conv modules.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=3,
        expansion=0.5,
        add_identity=True,
        use_depthwise=False,
        act="leaky_relu",
    ):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        second_conv_cls = ConvBNLayer
        if use_depthwise:
            second_conv_cls = DPModule
        self.conv1 = ConvBNLayer(
            in_channel=in_channels, out_channel=hidden_channels, kernel_size=1, act=act
        )
        self.conv2 = second_conv_cls(
            in_channel=hidden_channels,
            out_channel=out_channels,
            kernel_size=kernel_size,
            stride=1,
            act=act,
        )
        # The residual add needs matching channel counts on both sides.
        self.add_identity = add_identity and in_channels == out_channels

    def forward(self, x):
        """Run both conv modules; add ``x`` back when the residual is enabled."""
        out = self.conv2(self.conv1(x))
        if not self.add_identity:
            return out
        return out + x
class Channel_T(nn.Layer):
    """Bring every input feature map to the same channel count.

    One 1x1 ``ConvBNLayer`` is created per entry of ``in_channels``; the i-th
    layer is applied to the i-th input.

    Args:
        in_channels (Sequence[int]): Channel count of each input feature map.
        out_channels (int): Channel count produced for every feature map.
        act (str): Activation name passed to each ``ConvBNLayer``.
    """

    # The default is a tuple rather than a list so it cannot be mutated and
    # shared between instances.
    def __init__(self, in_channels=(116, 232, 464), out_channels=96, act="leaky_relu"):
        super().__init__()
        self.convs = nn.LayerList(
            [ConvBNLayer(channels, out_channels, 1, act=act) for channels in in_channels]
        )

    def forward(self, x):
        """Apply the i-th 1x1 conv to ``x[i]`` and return the results as a list."""
        # Indexing (rather than zip) keeps the IndexError when more inputs
        # than conv layers are passed.
        return [self.convs[i](feat) for i, feat in enumerate(x)]
Default: True + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size=5, + num_csp_blocks=1, + use_depthwise=True, + act="hard_swish", + ): + super(CSPPAN, self).__init__() + self.in_channels = in_channels + self.out_channels = [out_channels] * len(in_channels) + conv_func = DPModule if use_depthwise else ConvBNLayer + + self.conv_t = Channel_T(in_channels, out_channels, act=act) + + # build top-down blocks + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + self.top_down_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1, 0, -1): + self.top_down_blocks.append( + CSPLayer( + out_channels * 2, + out_channels, + kernel_size=kernel_size, + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + act=act, + ) + ) + + # build bottom-up blocks + self.downsamples = nn.LayerList() + self.bottom_up_blocks = nn.LayerList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv_func( + out_channels, + out_channels, + kernel_size=kernel_size, + stride=2, + act=act, + ) + ) + self.bottom_up_blocks.append( + CSPLayer( + out_channels * 2, + out_channels, + kernel_size=kernel_size, + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + act=act, + ) + ) + + def forward(self, inputs): + """ + Args: + inputs (tuple[Tensor]): input features. + Returns: + tuple[Tensor]: CSPPAN features. 
+ """ + assert len(inputs) == len(self.in_channels) + inputs = self.conv_t(inputs) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + upsample_feat = F.upsample( + feat_heigh, size=feat_low.shape[2:4], mode="nearest" + ) + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + paddle.concat([upsample_feat, feat_low], 1) + ) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx]( + paddle.concat([downsample_feat, feat_height], 1) + ) + outs.append(out) + + return tuple(outs) diff --git a/docling_ibm_models/slanet_1m/modeling/necks/rnn.py b/docling_ibm_models/slanet_1m/modeling/necks/rnn.py new file mode 100644 index 0000000..7f50319 --- /dev/null +++ b/docling_ibm_models/slanet_1m/modeling/necks/rnn.py @@ -0,0 +1,284 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class BidirectionalLSTM(nn.Layer):
    """``nn.LSTM`` wrapper with an optional linear projection of its output.

    Args:
        input_size (int): Feature size of the input sequence.
        hidden_size (int): Hidden size of the LSTM (per direction).
        output_size (int, optional): Output size of the linear projection;
            only used when ``with_linear`` is True.
        num_layers (int): Number of stacked LSTM layers.
        dropout (float): Dropout passed to ``nn.LSTM``.
        direction (str): Passed to ``nn.LSTM`` unchanged.
            NOTE(review): the default ``False`` is forwarded as-is; callers in
            this file always pass ``"bidirectional"``.
        time_major (bool): Passed to ``nn.LSTM`` unchanged.
        with_linear (bool): Append a linear layer after the LSTM.
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        output_size=None,
        num_layers=1,
        dropout=0,
        direction=False,
        time_major=False,
        with_linear=False,
    ):
        super().__init__()
        self.with_linear = with_linear
        self.rnn = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            direction=direction,
            time_major=time_major,
        )

        # text recognition the specified structure LSTM with linear
        if self.with_linear:
            # The LSTM output width is hidden_size per direction, so the
            # projection input must follow `direction` instead of always
            # assuming two directions.
            num_directions = 2 if direction in ("bidirect", "bidirectional") else 1
            self.linear = nn.Linear(hidden_size * num_directions, output_size)

    def forward(self, input_feature):
        """Run the LSTM and, when configured, the linear projection.

        Returns the projected sequence when ``with_linear`` is True, else the
        raw LSTM output sequence (the final states are discarded).
        """
        recurrent, _ = self.rnn(input_feature)
        if self.with_linear:
            return self.linear(recurrent)
        return recurrent
class EncoderWithSVTR(nn.Layer):
    """Encoder that mixes a feature map with a stack of global SVTR blocks.

    The input is reduced by two convs, flattened to a token sequence, passed
    through ``depth`` ``Block`` layers and a ``LayerNorm``, restored to a
    feature map, concatenated with the (optionally detached) input and fused
    down to ``dims`` channels.

    Args:
        in_channels (int): Channels of the input feature map.
        dims (int): Output channels (exposed as ``out_channels``).
        depth (int): Number of SVTR blocks.
        hidden_dims (int): Token width inside the SVTR blocks.
        use_guide (bool): When True the input is cloned and its gradient is
            stopped before encoding.
        num_heads, qkv_bias, mlp_ratio, qk_scale: Forwarded to each ``Block``.
        drop_rate (float): Forwarded to each ``Block`` as ``drop``.
        attn_drop_rate (float): Forwarded to each ``Block`` as ``attn_drop``.
        drop_path (float): Forwarded to each ``Block``.
        kernel_size (Sequence[int]): Two-element kernel size of the first and
            last spatial convs.
    """

    def __init__(
        self,
        in_channels,
        dims=64,  # XS
        depth=2,
        hidden_dims=120,
        use_guide=False,
        num_heads=8,
        qkv_bias=True,
        mlp_ratio=2.0,
        drop_rate=0.1,
        attn_drop_rate=0.1,
        drop_path=0.0,
        kernel_size=(3, 3),
        qk_scale=None,
    ):
        super().__init__()
        # The default is an immutable tuple; a fresh list is handed to the
        # conv layers so they receive the same type as before and no list
        # object is shared between instances.
        kernel_size = list(kernel_size)
        self.depth = depth
        self.use_guide = use_guide
        self.conv1 = ConvBNLayer(
            in_channels,
            in_channels // 8,
            kernel_size=kernel_size,
            padding=[kernel_size[0] // 2, kernel_size[1] // 2],
            act=nn.Swish,
        )
        self.conv2 = ConvBNLayer(
            in_channels // 8, hidden_dims, kernel_size=1, act=nn.Swish
        )

        self.svtr_block = nn.LayerList(
            [
                Block(
                    dim=hidden_dims,
                    num_heads=num_heads,
                    mixer="Global",
                    HW=None,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    act_layer=nn.Swish,
                    attn_drop=attn_drop_rate,
                    drop_path=drop_path,
                    norm_layer="nn.LayerNorm",
                    epsilon=1e-05,
                    prenorm=False,
                )
                for _ in range(depth)
            ]
        )
        self.norm = nn.LayerNorm(hidden_dims, epsilon=1e-6)
        self.conv3 = ConvBNLayer(hidden_dims, in_channels, kernel_size=1, act=nn.Swish)
        # last conv-nxn, the input is concat of input tensor and conv3 output tensor
        self.conv4 = ConvBNLayer(
            2 * in_channels,
            in_channels // 8,
            kernel_size=kernel_size,
            padding=[kernel_size[0] // 2, kernel_size[1] // 2],
            act=nn.Swish,
        )

        self.conv1x1 = ConvBNLayer(in_channels // 8, dims, kernel_size=1, act=nn.Swish)
        self.out_channels = dims
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise ``nn.Linear`` and ``nn.LayerNorm`` sub-layers in place."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward(self, x):
        """Encode a ``[B, C, H, W]`` feature map into ``[B, dims, H', W']``."""
        # for use guide
        if self.use_guide:
            z = x.clone()
            z.stop_gradient = True
        else:
            z = x
        # for short cut
        h = z
        # reduce dim
        z = self.conv1(z)
        z = self.conv2(z)
        # SVTR global block
        B, C, H, W = z.shape
        z = z.flatten(2).transpose([0, 2, 1])
        for blk in self.svtr_block:
            z = blk(z)
        z = self.norm(z)
        # last stage
        # The leading 0 in the target shape keeps the batch dimension as-is.
        z = z.reshape([0, H, W, C]).transpose([0, 3, 1, 2])
        z = self.conv3(z)
        z = paddle.concat((h, z), axis=1)
        z = self.conv1x1(self.conv4(z))
        return z
def build_optimizer(config, epochs, step_each_epoch, model):
    """Build the optimizer and its learning-rate schedule from ``config``.

    Args:
        config (dict): Optimizer section of the configuration. Recognised
            keys: ``"name"`` (optimizer class name), ``"lr"`` (scheduler
            config), optional ``"regularizer"`` (dict with ``"name"``) or
            ``"weight_decay"``, optional ``"clip_norm"`` or
            ``"clip_norm_global"``. Remaining keys are forwarded to the
            optimizer class. The dict is deep-copied and never modified.
        epochs (int): Total number of training epochs.
        step_each_epoch (int): Number of optimizer steps per epoch.
        model: Passed to the optimizer wrapper to obtain the final optimizer.

    Returns:
        tuple: ``(optimizer, lr)``.
    """
    from . import regularizer, optimizer

    config = copy.deepcopy(config)
    # step1 build lr
    lr = build_lr_scheduler(config.pop("lr"), epochs, step_each_epoch)

    # step2 build regularization
    # "regularizer" is always removed from the config, even when it is None,
    # so it is not forwarded to the optimizer constructor as a stray keyword.
    reg_config = config.pop("regularizer", None)
    if reg_config is not None:
        reg_name = reg_config.pop("name")
        if not hasattr(regularizer, reg_name):
            reg_name += "Decay"
        reg = getattr(regularizer, reg_name)(**reg_config)()
        # An explicit regularizer takes precedence. Drop any leftover
        # "weight_decay" entry, otherwise the call below would receive
        # `weight_decay` twice and raise TypeError.
        config.pop("weight_decay", None)
    else:
        reg = config.pop("weight_decay", None)

    # step3 build optimizer
    optim_name = config.pop("name")
    if "clip_norm" in config:
        clip_norm = config.pop("clip_norm")
        grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
    elif "clip_norm_global" in config:
        clip_norm = config.pop("clip_norm_global")
        grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=clip_norm)
    else:
        grad_clip = None
    optim = getattr(optimizer, optim_name)(
        learning_rate=lr, weight_decay=reg, grad_clip=grad_clip, **config
    )
    return optim(model), lr
class Linear(object):
    """
    Polynomial ("linear" when power is 1.0) learning rate decay with optional warmup
    Args:
        learning_rate (float): The initial learning rate.
        epochs (int): Total training epochs; multiplied by ``step_each_epoch`` to get the decay steps.
        step_each_epoch (int): Steps in each epoch.
        end_lr (float, optional): The minimum final learning rate. Default: 0.0.
        power (float, optional): Power of polynomial. Default: 1.0.
        warmup_epoch (int or float, optional): Warmup length in epochs; 0 disables warmup. Default: 0.
        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
    """

    def __init__(
        self,
        learning_rate,
        epochs,
        step_each_epoch,
        end_lr=0.0,
        power=1.0,
        warmup_epoch=0,
        last_epoch=-1,
        **kwargs,
    ):
        super().__init__()
        total_steps = epochs * step_each_epoch
        warmup_steps = round(warmup_epoch * step_each_epoch)
        self.learning_rate = learning_rate
        # Despite the name, `epochs` holds the decay length in steps.
        self.epochs = total_steps
        self.end_lr = end_lr
        self.power = power
        self.last_epoch = last_epoch
        # Likewise stored in steps, not epochs.
        self.warmup_epoch = warmup_steps

    def __call__(self):
        """Return the scheduler, wrapped in a linear warmup when requested."""
        schedule = lr.PolynomialDecay(
            learning_rate=self.learning_rate,
            decay_steps=self.epochs,
            end_lr=self.end_lr,
            power=self.power,
            last_epoch=self.last_epoch,
        )
        if self.warmup_epoch <= 0:
            return schedule
        return lr.LinearWarmup(
            learning_rate=schedule,
            warmup_steps=self.warmup_epoch,
            start_lr=0.0,
            end_lr=self.learning_rate,
            last_epoch=self.last_epoch,
        )
class Step(object):
    """
    Step learning rate decay with optional warmup
    Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        step_size (int): The interval to update, in epochs; multiplied by ``step_each_epoch``.
        step_each_epoch (int): Steps in each epoch.
        gamma (float): The ratio by which the learning rate is reduced, ``new_lr = origin_lr * gamma``.
        warmup_epoch (int or float, optional): Warmup length in epochs; 0 disables warmup. Default: 0.
        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
    """

    def __init__(
        self,
        learning_rate,
        step_size,
        step_each_epoch,
        gamma,
        warmup_epoch=0,
        last_epoch=-1,
        **kwargs,
    ):
        super().__init__()
        interval_steps = step_each_epoch * step_size
        warmup_steps = round(warmup_epoch * step_each_epoch)
        # Both durations are stored in optimizer steps.
        self.step_size = interval_steps
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.last_epoch = last_epoch
        self.warmup_epoch = warmup_steps

    def __call__(self):
        """Return the scheduler, wrapped in a linear warmup when requested."""
        schedule = lr.StepDecay(
            learning_rate=self.learning_rate,
            step_size=self.step_size,
            gamma=self.gamma,
            last_epoch=self.last_epoch,
        )
        if self.warmup_epoch <= 0:
            return schedule
        return lr.LinearWarmup(
            learning_rate=schedule,
            warmup_steps=self.warmup_epoch,
            start_lr=0.0,
            end_lr=self.learning_rate,
            last_epoch=self.last_epoch,
        )
class CyclicalCosine(object):
    """
    Cyclical cosine learning rate decay with optional warmup
    Args:
        learning_rate (float): initial learning rate
        step_each_epoch (int): steps each epoch
        epochs (int): total training epochs
        cycle (int or float): period of the cosine learning rate, in epochs
        warmup_epoch (int or float, optional): Warmup length in epochs; 0 disables warmup. Default: 0.
        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
    """

    def __init__(
        self,
        learning_rate,
        step_each_epoch,
        epochs,
        cycle,
        warmup_epoch=0,
        last_epoch=-1,
        **kwargs,
    ):
        super().__init__()
        total_steps = step_each_epoch * epochs
        warmup_steps = round(warmup_epoch * step_each_epoch)
        cycle_steps = round(cycle * step_each_epoch)
        self.learning_rate = learning_rate
        self.T_max = total_steps
        self.last_epoch = last_epoch
        # Warmup and cycle lengths are stored in optimizer steps.
        self.warmup_epoch = warmup_steps
        self.cycle = cycle_steps

    def __call__(self):
        """Return the scheduler, wrapped in a linear warmup when requested."""
        schedule = CyclicalCosineDecay(
            learning_rate=self.learning_rate,
            T_max=self.T_max,
            cycle=self.cycle,
            last_epoch=self.last_epoch,
        )
        if self.warmup_epoch <= 0:
            return schedule
        return lr.LinearWarmup(
            learning_rate=schedule,
            warmup_steps=self.warmup_epoch,
            start_lr=0.0,
            end_lr=self.learning_rate,
            last_epoch=self.last_epoch,
        )
+ """ + + def __init__( + self, + max_lr, + epochs, + step_each_epoch, + anneal_strategy="cos", + three_phase=False, + warmup_epoch=0, + last_epoch=-1, + **kwargs, + ): + super(OneCycle, self).__init__() + self.max_lr = max_lr + self.epochs = epochs + self.steps_per_epoch = step_each_epoch + self.anneal_strategy = anneal_strategy + self.three_phase = three_phase + self.last_epoch = last_epoch + self.warmup_epoch = round(warmup_epoch * step_each_epoch) + + def __call__(self): + learning_rate = OneCycleDecay( + max_lr=self.max_lr, + epochs=self.epochs, + steps_per_epoch=self.steps_per_epoch, + anneal_strategy=self.anneal_strategy, + three_phase=self.three_phase, + last_epoch=self.last_epoch, + ) + if self.warmup_epoch > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_epoch, + start_lr=0.0, + end_lr=self.max_lr, + last_epoch=self.last_epoch, + ) + return learning_rate + + +class Const(object): + """ + Const learning rate decay + Args: + learning_rate(float): initial learning rate + step_each_epoch(int): steps each epoch + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
class DecayLearningRate(object):
    """
    Factory for a polynomial learning rate decay.

    new_lr = (lr - end_lr) * (1 - epoch/decay_steps)**power + end_lr

    Args:
        learning_rate(float): initial learning rate
        step_each_epoch(int): steps in each epoch
        epochs(int): total training epochs
        factor(float): power of the polynomial; should be greater than 0.0 to
            get learning rate decay. Default: 0.9
        end_lr(float): the minimum final learning rate. Default: 0.
        **kwargs: accepted and ignored.
    """

    def __init__(
        self, learning_rate, step_each_epoch, epochs, factor=0.9, end_lr=0, **kwargs
    ):
        super(DecayLearningRate, self).__init__()
        self.learning_rate = learning_rate
        # Kept for backward compatibility; __call__ does not read it.
        self.epochs = epochs + 1
        self.factor = factor
        # Honour the caller's floor instead of always decaying to 0.
        self.end_lr = end_lr
        # The decay is expressed in optimizer steps, not epochs.
        self.decay_steps = step_each_epoch * epochs

    def __call__(self):
        """Return the configured ``lr.PolynomialDecay`` scheduler."""
        learning_rate = lr.PolynomialDecay(
            learning_rate=self.learning_rate,
            decay_steps=self.decay_steps,
            power=self.factor,
            end_lr=self.end_lr,
        )
        return learning_rate
class TwoStepCosine(object):
    """
    Factory for a two-stage cosine learning rate decay with optional warmup.

    Args:
        learning_rate(float): initial learning rate
        step_each_epoch(int): steps in each epoch
        epochs(int): total training epochs; sets the second stage period
        warmup_epoch(int|float): warmup length in epochs. It is stored as a
            number of steps, round(warmup_epoch * step_each_epoch). Default: 0
        last_epoch(int, optional): index of the last epoch, forwarded to the
            schedulers so training can be resumed. Default: -1
        **kwargs: accepted and ignored.
    """

    def __init__(
        self,
        learning_rate,
        step_each_epoch,
        epochs,
        warmup_epoch=0,
        last_epoch=-1,
        **kwargs,
    ):
        super().__init__()
        self.learning_rate = learning_rate
        # The first stage period is fixed at 200 epochs worth of steps.
        self.T_max1 = 200 * step_each_epoch
        # The second stage period spans the whole training run.
        self.T_max2 = epochs * step_each_epoch
        self.last_epoch = last_epoch
        # Note: despite its name, warmup_epoch holds a step count from here on.
        self.warmup_epoch = round(warmup_epoch * step_each_epoch)

    def __call__(self):
        """Build the scheduler; wrap it in a linear warmup when one is configured."""
        schedule = TwoStepCosineDecay(
            learning_rate=self.learning_rate,
            T_max1=self.T_max1,
            T_max2=self.T_max2,
            last_epoch=self.last_epoch,
        )
        if self.warmup_epoch <= 0:
            return schedule
        # Warmup ramps from 0.0 up to the base learning rate.
        return lr.LinearWarmup(
            learning_rate=schedule,
            warmup_steps=self.warmup_epoch,
            start_lr=0.0,
            end_lr=self.learning_rate,
            last_epoch=self.last_epoch,
        )
class CyclicalCosineDecay(LRScheduler):
    """
    Cyclical cosine learning rate decay.

    The schedule is described in https://arxiv.org/pdf/2012.12645.pdf

    Args:
        learning_rate(float): base learning rate
        T_max(int): maximum epoch number; accepted but not used by this class
        cycle(int): period of the cosine decay. Default: 1
        last_epoch(int, optional): index of the last epoch; can be set to
            restart training. Default: -1
        eta_min(float): minimum learning rate during training. Default: 0.0
        verbose(bool): forwarded to the base scheduler. Default: False
    """

    def __init__(
        self, learning_rate, T_max, cycle=1, last_epoch=-1, eta_min=0.0, verbose=False
    ):
        # The base constructor runs before cycle/eta_min are assigned.
        # NOTE(review): presumably it already triggers get_lr(), which would be
        # why get_lr() short-circuits at last_epoch == 0 - confirm against paddle.
        super().__init__(learning_rate, last_epoch, verbose)
        self.eta_min = eta_min
        self.cycle = cycle

    def get_lr(self):
        """Return the learning rate for the current position inside the cycle."""
        if self.last_epoch == 0:
            return self.base_lr
        position = self.last_epoch % self.cycle
        half_range = 0.5 * (self.base_lr - self.eta_min)
        return self.eta_min + half_range * (
            1 + math.cos(math.pi * position / self.cycle)
        )
self.max_lr / div_factor + self.min_lr = self.initial_lr / final_div_factor + + if three_phase: + self._schedule_phases = [ + { + "end_step": float(pct_start * self.total_steps) - 1, + "start_lr": self.initial_lr, + "end_lr": self.max_lr, + }, + { + "end_step": float(2 * pct_start * self.total_steps) - 2, + "start_lr": self.max_lr, + "end_lr": self.initial_lr, + }, + { + "end_step": self.total_steps - 1, + "start_lr": self.initial_lr, + "end_lr": self.min_lr, + }, + ] + else: + self._schedule_phases = [ + { + "end_step": float(pct_start * self.total_steps) - 1, + "start_lr": self.initial_lr, + "end_lr": self.max_lr, + }, + { + "end_step": self.total_steps - 1, + "start_lr": self.max_lr, + "end_lr": self.min_lr, + }, + ] + + # Validate pct_start + if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): + raise ValueError( + "Expected float between 0 and 1 pct_start, but got {}".format(pct_start) + ) + + # Validate anneal_strategy + if anneal_strategy not in ["cos", "linear"]: + raise ValueError( + "anneal_strategy must by one of 'cos' or 'linear', instead got {}".format( + anneal_strategy + ) + ) + elif anneal_strategy == "cos": + self.anneal_func = self._annealing_cos + elif anneal_strategy == "linear": + self.anneal_func = self._annealing_linear + + super(OneCycleDecay, self).__init__(max_lr, last_epoch, verbose) + + def _annealing_cos(self, start, end, pct): + "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0." + cos_out = math.cos(math.pi * pct) + 1 + return end + (start - end) / 2.0 * cos_out + + def _annealing_linear(self, start, end, pct): + "Linearly anneal from `start` to `end` as pct goes from 0.0 to 1.0." + return (end - start) * pct + start + + def get_lr(self): + computed_lr = 0.0 + step_num = self.last_epoch + + if step_num > self.total_steps: + raise ValueError( + "Tried to step {} times. 
class TwoStepCosineDecay(LRScheduler):
    """
    Cosine annealing that uses one period up to ``T_max1`` and another afterwards.

    Args:
        learning_rate(float): base learning rate
        T_max1(int): period used while ``last_epoch <= T_max1``; must be a
            positive integer
        T_max2(int): period used once ``last_epoch > T_max1``; must be a
            positive integer
        eta_min(float|int): minimum learning rate. Default: 0
        last_epoch(int, optional): index of the last epoch; can be set to
            restart training. Default: -1
        verbose(bool): forwarded to the base scheduler. Default: False

    Raises:
        TypeError: if ``T_max1``/``T_max2`` is not an int or ``eta_min`` is not
            a float or int.
        AssertionError: if ``T_max1`` or ``T_max2`` is not positive.
    """

    def __init__(
        self, learning_rate, T_max1, T_max2, eta_min=0, last_epoch=-1, verbose=False
    ):
        if not isinstance(T_max1, int):
            raise TypeError(
                "The type of 'T_max1' in 'TwoStepCosineDecay' must be 'int', but received %s."
                % type(T_max1)
            )
        if not isinstance(T_max2, int):
            raise TypeError(
                "The type of 'T_max2' in 'TwoStepCosineDecay' must be 'int', but received %s."
                % type(T_max2)
            )
        if not isinstance(eta_min, (float, int)):
            raise TypeError(
                "The type of 'eta_min' in 'TwoStepCosineDecay' must be 'float, int', but received %s."
                % type(eta_min)
            )
        # The int-ness is already guaranteed above; only positivity is left.
        assert T_max1 > 0, " 'T_max1' must be a positive integer."
        assert T_max2 > 0, " 'T_max2' must be a positive integer."

        # These attributes are assigned before the base constructor runs, the
        # same order as the original code.
        self.T_max1 = T_max1
        self.T_max2 = T_max2
        self.eta_min = float(eta_min)
        super(TwoStepCosineDecay, self).__init__(learning_rate, last_epoch, verbose)

    def _recursive_lr(self, t_max):
        """Derive the next rate from ``self.last_lr`` for a cosine of period ``t_max``."""
        if (self.last_epoch - 1 - t_max) % (2 * t_max) == 0:
            return (
                self.last_lr
                + (self.base_lr - self.eta_min) * (1 - math.cos(math.pi / t_max)) / 2
            )

        return (1 + math.cos(math.pi * self.last_epoch / t_max)) / (
            1 + math.cos(math.pi * (self.last_epoch - 1) / t_max)
        ) * (self.last_lr - self.eta_min) + self.eta_min

    def get_lr(self):
        """Return the learning rate for the current epoch (recursive form)."""
        if self.last_epoch <= self.T_max1:
            if self.last_epoch == 0:
                return self.base_lr
            return self._recursive_lr(self.T_max1)
        return self._recursive_lr(self.T_max2)

    def _get_closed_form_lr(self):
        """Return the learning rate for the current epoch (closed form)."""
        t_max = self.T_max1 if self.last_epoch <= self.T_max1 else self.T_max2
        return (
            self.eta_min
            + (self.base_lr - self.eta_min)
            * (1 + math.cos(math.pi * self.last_epoch / t_max))
            / 2
        )
class Adam(object):
    """
    Factory that builds a paddle ``Adam`` optimizer for a model.

    Args:
        learning_rate: learning rate value or scheduler. Default: 0.001
        beta1(float): forwarded to the optimizer. Default: 0.9
        beta2(float): forwarded to the optimizer. Default: 0.999
        epsilon(float): forwarded to the optimizer. Default: 1e-08
        parameter_list: stored on the instance; not read by ``__call__``.
        weight_decay: forwarded to the optimizer. Default: None
        grad_clip: forwarded to the optimizer. Default: None
        name(str, optional): forwarded to the optimizer. Default: None
        lazy_mode(bool): forwarded to the optimizer. Default: False
        **kwargs: ``group_lr`` (default False) and ``training_step``
            (default None) are read; anything else is ignored.
    """

    def __init__(
        self,
        learning_rate=0.001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-08,
        parameter_list=None,
        weight_decay=None,
        grad_clip=None,
        name=None,
        lazy_mode=False,
        **kwargs,
    ):
        self.learning_rate = learning_rate
        self.beta1, self.beta2 = beta1, beta2
        self.epsilon = epsilon
        self.parameter_list = parameter_list
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip
        self.name = name
        self.lazy_mode = lazy_mode
        self.group_lr = kwargs.get("group_lr", False)
        self.training_step = kwargs.get("training_step", None)

    def __call__(self, model):
        """Create the optimizer over ``model``'s parameters."""
        if self.group_lr and self.training_step == "LF_2":
            import paddle

            # Under DataParallel (multi gpu) the real network sits in ``_layers``.
            core = model._layers if isinstance(model, paddle.DataParallel) else model
            vrm = core.head.MLM_VRM
            base_ids = set()
            for params in (
                vrm.MLM.parameters(),
                vrm.Prediction.pp_share.parameters(),
                vrm.Prediction.w_share.parameters(),
            ):
                base_ids.update(id(param) for param in params)

            group_base_params = []
            group_small_params = []
            for param in model.parameters():
                if id(param) in base_ids:
                    group_base_params.append(param)
                else:
                    group_small_params.append(param)
            # All other parameters train at a tenth of the first scheduled value.
            train_params = [
                {"params": group_base_params},
                {
                    "params": group_small_params,
                    "learning_rate": self.learning_rate.values[0] * 0.1,
                },
            ]
        else:
            if self.group_lr:
                print("group lr currently only support VisionLAN in LF_2 training step")
            train_params = [
                param for param in model.parameters() if param.trainable is True
            ]

        return optim.Adam(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            name=self.name,
            lazy_mode=self.lazy_mode,
            parameters=train_params,
        )
class AdamW(object):
    """
    Factory that builds a paddle ``AdamW`` optimizer for a model.

    Args:
        learning_rate: learning rate value or scheduler. Default: 0.001
        beta1(float): forwarded to the optimizer. Default: 0.9
        beta2(float): forwarded to the optimizer. Default: 0.999
        epsilon(float): forwarded to the optimizer. Default: 1e-8
        weight_decay(float): decay coefficient; ``None`` falls back to 0.01.
        multi_precision(bool): forwarded to the optimizer. Default: False
        grad_clip: forwarded to the optimizer. Default: None
        no_weight_decay_name(str, optional): whitespace separated name
            fragments; parameters whose name contains one are not decayed.
        one_dim_param_no_weight_decay(bool): also exempt parameters whose
            shape has length 1. Default: False
        name(str, optional): forwarded to the optimizer. Default: None
        lazy_mode(bool): forwarded to the optimizer. Default: False
        **args: accepted and ignored.
    """

    def __init__(
        self,
        learning_rate=0.001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-8,
        weight_decay=0.01,
        multi_precision=False,
        grad_clip=None,
        no_weight_decay_name=None,
        one_dim_param_no_weight_decay=False,
        name=None,
        lazy_mode=False,
        **args,
    ):
        super().__init__()
        self.learning_rate = learning_rate
        self.beta1, self.beta2 = beta1, beta2
        self.epsilon = epsilon
        self.grad_clip = grad_clip
        self.weight_decay = weight_decay if weight_decay is not None else 0.01
        self.name = name
        self.lazy_mode = lazy_mode
        self.multi_precision = multi_precision
        if no_weight_decay_name:
            self.no_weight_decay_name_list = no_weight_decay_name.split()
        else:
            self.no_weight_decay_name_list = []
        self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay

    def __call__(self, model):
        """Create the optimizer over ``model``'s trainable parameters."""
        parameters = [param for param in model.parameters() if param.trainable is True]

        # First pass: parameters matched by a configured name fragment.
        exempt = []
        for full_name, param in model.named_parameters():
            if any(fragment in full_name for fragment in self.no_weight_decay_name_list):
                exempt.append(param.name)

        # Optional second pass: parameters with a one-dimensional shape.
        if self.one_dim_param_no_weight_decay:
            for _, param in model.named_parameters():
                if len(param.shape) == 1:
                    exempt.append(param.name)

        self.no_weight_decay_param_name_list = exempt

        return optim.AdamW(
            learning_rate=self.learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            parameters=parameters,
            weight_decay=self.weight_decay,
            multi_precision=self.multi_precision,
            grad_clip=self.grad_clip,
            name=self.name,
            lazy_mode=self.lazy_mode,
            apply_decay_param_fun=self._apply_decay_param_fun,
        )

    def _apply_decay_param_fun(self, name):
        """Return True when the parameter called ``name`` should be decayed."""
        return name not in self.no_weight_decay_param_name_list
class L1Decay(object):
    """
    Factory for L1 weight decay regularization.

    Args:
        factor(float): regularization coefficient, stored as given. Default: 0.0
    """

    def __init__(self, factor=0.0):
        super().__init__()
        self.coeff = factor

    def __call__(self):
        """Return a ``paddle.regularizer.L1Decay`` built from the coefficient."""
        return paddle.regularizer.L1Decay(self.coeff)


class L2Decay(object):
    """
    Holder for an L2 weight decay coefficient.

    Args:
        factor(float): regularization coefficient, converted with ``float()``.
            Default: 0.0
    """

    def __init__(self, factor=0.0):
        super().__init__()
        self.coeff = float(factor)

    def __call__(self):
        """Return the bare coefficient; no regularizer object is created."""
        return self.coeff
class TableSystem(object):
    """
    Table recognition pipeline: structure prediction, text detection, text
    recognition and matching of the recognized text into the table structure.

    Args:
        args: parsed options; ``show_log``, ``benchmark`` and
            ``table_algorithm`` are read here, the rest is handed on to the
            component constructors.
        text_detector(optional): detector to reuse; a ``TextDetector`` is
            created when None.
        text_recognizer(optional): recognizer to reuse; a ``TextRecognizer``
            is created when None.
    """

    def __init__(self, args, text_detector=None, text_recognizer=None):
        self.args = args
        if not args.show_log:
            logger.setLevel(logging.INFO)
        # Benchmarking is switched off while the detector and recognizer are
        # built, then restored so only the structure model sees it enabled.
        benchmark_tmp = False
        if args.benchmark:
            benchmark_tmp = args.benchmark
            args.benchmark = False
        self.text_detector = (
            predict_det.TextDetector(copy.deepcopy(args))
            if text_detector is None
            else text_detector
        )
        self.text_recognizer = (
            predict_rec.TextRecognizer(copy.deepcopy(args))
            if text_recognizer is None
            else text_recognizer
        )
        if benchmark_tmp:
            args.benchmark = True
        self.table_structurer = predict_strture.TableStructurer(args)
        if args.table_algorithm in ["TableMaster"]:
            self.match = TableMasterMatcher()
        else:
            self.match = TableMatch(filter_ocr_result=True)

        (
            self.predictor,
            self.input_tensor,
            self.output_tensors,
            self.config,
        ) = utility.create_predictor(args, "table", logger)

    def __call__(self, img, return_ocr_result_in_table=False):
        """
        Run the full pipeline on one image.

        Args:
            img: image array; every stage works on a deep copy.
            return_ocr_result_in_table(bool): also return the text boxes
                ("boxes") and recognition results ("rec_res").

        Returns:
            tuple: ``(result, time_dict)``. ``result`` holds "cell_bbox" and
            "html"; ``time_dict`` holds the elapsed seconds under "det",
            "rec", "table", "match" and "all".
        """
        result = dict()
        time_dict = {"det": 0, "rec": 0, "table": 0, "all": 0, "match": 0}
        start = time.time()
        structure_res, elapse = self._structure(copy.deepcopy(img))
        result["cell_bbox"] = structure_res[1].tolist()
        time_dict["table"] = elapse

        dt_boxes, rec_res, det_elapse, rec_elapse = self._ocr(copy.deepcopy(img))
        time_dict["det"] = det_elapse
        time_dict["rec"] = rec_elapse

        if return_ocr_result_in_table:
            result["boxes"] = [x.tolist() for x in dt_boxes]
            result["rec_res"] = rec_res

        tic = time.time()
        pred_html = self.match(structure_res, dt_boxes, rec_res)
        toc = time.time()
        time_dict["match"] = toc - tic
        result["html"] = pred_html
        end = time.time()
        time_dict["all"] = end - start
        return result, time_dict

    def _structure(self, img):
        """Run the table structure model and return ``(structure_res, elapse)``."""
        structure_res, elapse = self.table_structurer(copy.deepcopy(img))
        return structure_res, elapse

    def _ocr(self, img):
        """
        Detect and recognize the text regions of ``img``.

        Returns:
            tuple: ``(dt_boxes, rec_res, det_elapse, rec_elapse)`` where
            ``dt_boxes`` is an array of ``[x_min, y_min, x_max, y_max]`` rows.
        """
        h, w = img.shape[:2]
        dt_boxes, det_elapse = self.text_detector(copy.deepcopy(img))
        if dt_boxes is None:
            # The original tested for None only after wrapping the boxes in
            # np.array(), where it can never be true, and answered with a
            # 2-tuple that the caller cannot unpack. Check before the boxes are
            # used and keep the 4-tuple shape, with empty results.
            logger.debug("dt_boxes num : {}, elapse : {}".format(0, det_elapse))
            return np.array([]), [], det_elapse, 0
        dt_boxes = sorted_boxes(dt_boxes)

        # Turn each detected polygon into an axis-aligned box, grown by one
        # pixel per side and clipped to the image.
        r_boxes = []
        for box in dt_boxes:
            x_min = max(0, box[:, 0].min() - 1)
            x_max = min(w, box[:, 0].max() + 1)
            y_min = max(0, box[:, 1].min() - 1)
            y_max = min(h, box[:, 1].max() + 1)
            r_boxes.append([x_min, y_min, x_max, y_max])
        dt_boxes = np.array(r_boxes)
        logger.debug("dt_boxes num : {}, elapse : {}".format(len(dt_boxes), det_elapse))

        # Crops are taken with a further 2 pixel margin around every box.
        img_crop_list = []
        for det_box in dt_boxes:
            x0, y0, x1, y1 = expand(2, det_box, img.shape)
            text_rect = img[int(y0) : int(y1), int(x0) : int(x1), :]
            img_crop_list.append(text_rect)
        rec_res, rec_elapse = self.text_recognizer(img_crop_list)
        logger.debug("rec_res num : {}, elapse : {}".format(len(rec_res), rec_elapse))
        return dt_boxes, rec_res, det_elapse, rec_elapse
len(pred_res["cell_bbox"]) > 0 and len(pred_res["cell_bbox"][0]) == 4: + img = predict_strture.draw_rectangle(image_file, pred_res["cell_bbox"]) + else: + img = utility.draw_boxes(img, pred_res["cell_bbox"]) + img_save_path = os.path.join(args.output, os.path.basename(image_file)) + cv2.imwrite(img_save_path, img) + + f_html.write("\n") + f_html.write(f"\n') + f_html.write( + '
img name\n") + f_html.write("ori imagetable htmlcell box
{os.path.basename(image_file)}
\n") + f_html.write(f'
' + + pred_html.replace("
", "").replace( + "
", "" + ) + + "
if __name__ == "__main__":
    # Script entry point. The assignments below overwrite whatever
    # parse_args() produced for these options.
    args = parse_args()
    args.image_dir = r"12_tables"

    #args.det_model_dir = "...\en_ppocr_mobile_v2.0_table_det_infer"
    args.det_model_dir = "inference_table/en_PP-OCRv3_det_infer"
    #args.rec_model_dir = "...\en_ppocr_mobile_v2.0_table_rec_infer"
    args.rec_model_dir = "inference_table/en_PP-OCRv3_rec_infer"
    args.table_model_dir = "model_final"
    #args.table_model_dir = "...\Desktop\model_final"
    args.rec_char_dict_path = "dict_table/en_dict.txt"
    args.table_char_dict_path = "dict_table/table_structure_dict.txt"
    # NOTE(review): backslash-rooted path, unlike the relative forward-slash
    # paths above - confirm it resolves on the target platform.
    args.font_path = r'\Fonts\Arial.ttf'

    args.output = "output"
    if args.use_mp:
        import subprocess

        # Re-run this script once per worker; each child gets its own
        # process_id and has --use_mp=False appended (presumably so that it
        # takes the main(args) branch instead of spawning again).
        p_list = []
        total_process_num = args.total_process_num
        for process_id in range(total_process_num):
            cmd = (
                [sys.executable, "-u"]
                + sys.argv
                + ["--process_id={}".format(process_id), "--use_mp={}".format(False)]
            )
            # Both child streams are sent to this process's stdout.
            p = subprocess.Popen(cmd, stdout=sys.stdout, stderr=sys.stdout)
            p_list.append(p)
        # Wait for every worker before exiting.
        for p in p_list:
            p.wait()
    else:
        main(args)
class ArgsParser(ArgumentParser):
    """
    Command line parser for the training and evaluation entry points.

    Options:
        -c/--config: configuration file to use (required by ``parse_args``).
        -o/--opt: one or more ``key=value`` overrides; each value is parsed
            as YAML.
        -p/--profiler_options: profiler option string. Default: None
    """

    def __init__(self):
        super(ArgsParser, self).__init__(formatter_class=RawDescriptionHelpFormatter)
        self.add_argument("-c", "--config", help="configuration file to use")
        self.add_argument("-o", "--opt", nargs="+", help="set configuration options")
        self.add_argument(
            "-p",
            "--profiler_options",
            type=str,
            default=None,
            help="The option of profiler, which should be in format "
            '"key1=value1;key2=value2;key3=value3".',
        )

    def parse_args(self, argv=None):
        """
        Parse ``argv`` and turn ``args.opt`` into a dict of overrides.

        Raises:
            AssertionError: if no config file was given.
            ValueError: if an ``--opt`` entry has no ``=``.
        """
        args = super(ArgsParser, self).parse_args(argv)
        assert args.config is not None, "Please specify --config=configure_file_path."
        args.opt = self._parse_opt(args.opt)
        return args

    def _parse_opt(self, opts):
        """
        Convert ``["a.b=1", ...]`` into ``{"a.b": 1, ...}``.

        Args:
            opts(list[str]|None): raw ``key=value`` strings.

        Returns:
            dict: option name mapped to the YAML-parsed value; empty when
            ``opts`` is empty or None.

        Raises:
            ValueError: if an entry does not contain ``=``.
        """
        config = {}
        if not opts:
            return config
        for s in opts:
            s = s.strip()
            # Split on the first "=" only, so values that contain "=" (paths,
            # urls, expressions) survive instead of failing to unpack.
            k, sep, v = s.partition("=")
            if not sep:
                raise ValueError(
                    "Expected option in the form key=value, but got {}".format(s)
                )
            # NOTE(review): yaml.Loader can construct arbitrary Python objects.
            # Left unchanged because the values come from the local command
            # line; do not feed untrusted input through this path.
            config[k] = yaml.load(v, Loader=yaml.Loader)
        return config
def merge_config(config, opts):
    """
    Merge override options into the global config, in place.

    Args:
        config (dict): global config to be updated.
        opts (dict): overrides. A plain key replaces the entry, or updates it
            when both the value and the existing key are present as a dict.
            A dotted key ("a.b.c") walks the nested dicts and sets the last
            component.
    Returns: global config
    """
    for option, value in opts.items():
        if "." in option:
            # "." is present, so there is always a root and a leaf component.
            root, *inner, leaf = option.split(".")
            assert root in config, (
                "the sub_keys can only be one of global_config: {}, but get: "
                "{}, please check your running command".format(config.keys(), root)
            )
            node = config[root]
            for part in inner:
                node = node[part]
            node[leaf] = value
            continue

        if isinstance(value, dict) and option in config:
            config[option].update(value)
        else:
            config[option] = value
    return config
def to_float32(preds):
    """Cast every paddle.Tensor found in ``preds`` to float32.

    ``preds`` may be a tensor, or a dict / list nested to any depth. Dicts
    and lists are updated in place and returned; a bare tensor is returned
    as a new float32 tensor. Any other value is returned unchanged.
    """
    if isinstance(preds, dict):
        for name in preds:
            preds[name] = to_float32(preds[name])
        return preds
    if isinstance(preds, list):
        for pos, element in enumerate(preds):
            preds[pos] = to_float32(element)
        return preds
    if isinstance(preds, paddle.Tensor):
        return preds.astype(paddle.float32)
    return preds
def train(
    config,
    train_dataloader,
    valid_dataloader,
    device,
    model,
    loss_class,
    optimizer,
    lr_scheduler,
    post_process_class,
    eval_class,
    pre_best_model_dict,
    logger,
    step_pre_epoch,
    log_writer=None,
    scaler=None,
    amp_level="O2",
    amp_custom_black_list=None,
    amp_custom_white_list=None,
    amp_dtype="float16",
):
    """Run the training loop with periodic logging, evaluation and checkpointing.

    Args:
        config (dict): Global config; reads the "Global", "Architecture",
            "Loss" and "profiler_options" entries.
        train_dataloader: Training loader; rebuilt every epoch when
            ``train_dataloader.dataset.need_reset`` is true.
        valid_dataloader: Evaluation loader. ``None`` or an empty loader
            disables evaluation during training.
        device: Passed through to ``build_dataloader``.
        model, loss_class, optimizer, lr_scheduler: Training components.
        post_process_class, eval_class: Decoding and metric objects.
        pre_best_model_dict (dict): State restored from a checkpoint; may
            carry "global_step", "start_epoch" and earlier best metrics.
        logger: Logger used for progress messages.
        step_pre_epoch (int): Iterations per epoch, used with
            ``Global.eval_batch_epoch``.
        log_writer: Optional metrics/model logger.
        scaler: AMP grad scaler; a truthy value enables ``auto_cast``.
        amp_level, amp_custom_black_list, amp_custom_white_list, amp_dtype:
            Forwarded to ``paddle.amp.auto_cast``; list defaults are ``[]``.
    """
    # Avoid sharing mutable default lists between calls.
    if amp_custom_black_list is None:
        amp_custom_black_list = []
    if amp_custom_white_list is None:
        amp_custom_white_list = []

    cal_metric_during_train = config["Global"].get("cal_metric_during_train", False)
    calc_epoch_interval = config["Global"].get("calc_epoch_interval", 1)
    log_smooth_window = config["Global"]["log_smooth_window"]
    epoch_num = config["Global"]["epoch_num"]
    print_batch_step = config["Global"]["print_batch_step"]
    eval_batch_step = config["Global"]["eval_batch_step"]
    eval_batch_epoch = config["Global"].get("eval_batch_epoch", None)
    profiler_options = config["profiler_options"]

    global_step = pre_best_model_dict.get("global_step", 0)
    start_eval_step = 0
    if isinstance(eval_batch_step, list) and len(eval_batch_step) >= 2:
        start_eval_step = eval_batch_step[0] if not eval_batch_epoch else 0
        eval_batch_step = (
            eval_batch_step[1]
            if not eval_batch_epoch
            else step_pre_epoch * eval_batch_epoch
        )
        # The caller passes None when no "Eval" section is configured, so
        # guard before calling len().
        if valid_dataloader is None or len(valid_dataloader) == 0:
            logger.info(
                "No Images in eval dataset, evaluation during training "
                "will be disabled"
            )
            start_eval_step = 1e111
        logger.info(
            "During the training process, after the {}th iteration, "
            "an evaluation is run every {} iterations".format(
                start_eval_step, eval_batch_step
            )
        )
    save_epoch_step = config["Global"]["save_epoch_step"]
    save_model_dir = config["Global"]["save_model_dir"]
    if not os.path.exists(save_model_dir):
        os.makedirs(save_model_dir)
    main_indicator = eval_class.main_indicator
    best_model_dict = {main_indicator: 0}
    best_model_dict.update(pre_best_model_dict)
    train_stats = TrainingStats(log_smooth_window, ["lr"])
    model_average = False
    model.train()

    algorithm = config["Architecture"]["algorithm"]
    use_srn = algorithm == "SRN"
    extra_input_models = [
        "SRN",
        "NRTR",
        "SAR",
        "SEED",
        "SVTR",
        "SVTR_LCNet",
        "SPIN",
        "VisionLAN",
        "RobustScanner",
        "RFL",
        "DRRG",
        "SATRN",
        "SVTR_HGNet",
        "ParseQ",
        "CPPD",
    ]
    extra_input = False
    if algorithm == "Distillation":
        for key in config["Architecture"]["Models"]:
            extra_input = (
                extra_input
                or config["Architecture"]["Models"][key]["algorithm"]
                in extra_input_models
            )
    else:
        extra_input = algorithm in extra_input_models

    model_type = config["Architecture"].get("model_type")
    if algorithm in ["CAN"]:
        # eval() selects the CAN forward/metric path through
        # model_type == "can"; set it up front so evaluation during training
        # takes that path even when cal_metric_during_train is off.
        model_type = "can"

    start_epoch = best_model_dict.get("start_epoch", 1)

    total_samples = 0
    train_reader_cost = 0.0
    train_batch_cost = 0.0
    reader_start = time.time()
    eta_meter = AverageMeter()

    max_iter = (
        len(train_dataloader) - 1
        if platform.system() == "Windows"
        else len(train_dataloader)
    )

    for epoch in range(start_epoch, epoch_num + 1):
        if train_dataloader.dataset.need_reset:
            train_dataloader = build_dataloader(
                config, "Train", device, logger, seed=epoch
            )
            max_iter = (
                len(train_dataloader) - 1
                if platform.system() == "Windows"
                else len(train_dataloader)
            )

        for idx, batch in enumerate(train_dataloader):
            profiler.add_profiler_step(profiler_options)
            train_reader_cost += time.time() - reader_start
            if idx >= max_iter:
                break
            lr = optimizer.get_lr()
            images = batch[0]
            if use_srn:
                model_average = True
            # use amp
            if scaler:
                with paddle.amp.auto_cast(
                    level=amp_level,
                    custom_black_list=amp_custom_black_list,
                    custom_white_list=amp_custom_white_list,
                    dtype=amp_dtype,
                ):
                    if model_type == "table" or extra_input:
                        preds = model(images, data=batch[1:])
                    elif model_type in ["kie"]:
                        preds = model(batch)
                    elif algorithm in ["CAN"]:
                        preds = model(batch[:3])
                    else:
                        preds = model(images)
                preds = to_float32(preds)
                loss = loss_class(preds, batch)
                avg_loss = loss["loss"]
                scaled_avg_loss = scaler.scale(avg_loss)
                scaled_avg_loss.backward()
                scaler.minimize(optimizer, scaled_avg_loss)
            else:
                if model_type == "table" or extra_input:
                    preds = model(images, data=batch[1:])
                elif model_type in ["kie", "sr"]:
                    preds = model(batch)
                elif algorithm in ["CAN"]:
                    preds = model(batch[:3])
                else:
                    preds = model(images)
                loss = loss_class(preds, batch)
                avg_loss = loss["loss"]
                avg_loss.backward()
                optimizer.step()

            optimizer.clear_grad()

            if (
                cal_metric_during_train and epoch % calc_epoch_interval == 0
            ):  # only rec and cls need
                batch = [item.numpy() for item in batch]
                if model_type in ["kie", "sr"]:
                    eval_class(preds, batch)
                elif model_type in ["table"]:
                    post_result = post_process_class(preds, batch)
                    eval_class(post_result, batch)
                elif algorithm in ["CAN"]:
                    eval_class(preds[0], batch[2:], epoch_reset=(idx == 0))
                else:
                    if config["Loss"]["name"] in [
                        "MultiLoss",
                        "MultiLoss_v2",
                    ]:  # for multi head loss
                        post_result = post_process_class(
                            preds["ctc"], batch[1]
                        )  # for CTC head out
                    elif config["Loss"]["name"] in ["VLLoss"]:
                        post_result = post_process_class(preds, batch[1], batch[-1])
                    else:
                        post_result = post_process_class(preds, batch[1])
                    eval_class(post_result, batch)
                metric = eval_class.get_metric()
                train_stats.update(metric)

            train_batch_time = time.time() - reader_start
            train_batch_cost += train_batch_time
            eta_meter.update(train_batch_time)
            global_step += 1
            total_samples += len(images)

            if not isinstance(lr_scheduler, float):
                lr_scheduler.step()

            # logger and visualdl
            stats = {
                k: float(v) if v.shape == [] else v.numpy().mean()
                for k, v in loss.items()
            }
            stats["lr"] = lr
            train_stats.update(stats)

            if log_writer is not None and dist.get_rank() == 0:
                log_writer.log_metrics(
                    metrics=train_stats.get(), prefix="TRAIN", step=global_step
                )

            if dist.get_rank() == 0 and (
                (global_step > 0 and global_step % print_batch_step == 0)
                or (idx >= len(train_dataloader) - 1)
            ):
                logs = train_stats.log()

                eta_sec = (
                    (epoch_num + 1 - epoch) * len(train_dataloader) - idx - 1
                ) * eta_meter.avg
                eta_sec_format = str(datetime.timedelta(seconds=int(eta_sec)))
                max_mem_reserved_str = ""
                max_mem_allocated_str = ""
                if paddle.device.is_compiled_with_cuda():
                    max_mem_reserved_str = f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB,"
                    max_mem_allocated_str = f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB"
                strs = (
                    "epoch: [{}/{}], global_step: {}, {}, avg_reader_cost: "
                    "{:.5f} s, avg_batch_cost: {:.5f} s, avg_samples: {}, "
                    "ips: {:.5f} samples/s, eta: {}, {} {}".format(
                        epoch,
                        epoch_num,
                        global_step,
                        logs,
                        train_reader_cost / print_batch_step,
                        train_batch_cost / print_batch_step,
                        total_samples / print_batch_step,
                        total_samples / train_batch_cost,
                        eta_sec_format,
                        max_mem_reserved_str,
                        max_mem_allocated_str,
                    )
                )
                logger.info(strs)

                # Restart the running totals for the next logging window.
                total_samples = 0
                train_reader_cost = 0.0
                train_batch_cost = 0.0
            # eval
            if (
                global_step > start_eval_step
                and (global_step - start_eval_step) % eval_batch_step == 0
                and dist.get_rank() == 0
            ):
                if model_average:
                    Model_Average = paddle.incubate.optimizer.ModelAverage(
                        0.15,
                        parameters=model.parameters(),
                        min_average_window=10000,
                        max_average_window=15625,
                    )
                    Model_Average.apply()
                cur_metric = eval(
                    model,
                    valid_dataloader,
                    post_process_class,
                    eval_class,
                    model_type,
                    extra_input=extra_input,
                    scaler=scaler,
                    amp_level=amp_level,
                    amp_custom_black_list=amp_custom_black_list,
                    amp_custom_white_list=amp_custom_white_list,
                    amp_dtype=amp_dtype,
                )
                cur_metric_str = "cur metric, {}".format(
                    ", ".join(["{}: {}".format(k, v) for k, v in cur_metric.items()])
                )
                logger.info(cur_metric_str)

                # logger metric
                if log_writer is not None:
                    log_writer.log_metrics(
                        metrics=cur_metric, prefix="EVAL", step=global_step
                    )

                if cur_metric[main_indicator] >= best_model_dict[main_indicator]:
                    best_model_dict.update(cur_metric)
                    best_model_dict["best_epoch"] = epoch
                    save_model(
                        model,
                        optimizer,
                        save_model_dir,
                        logger,
                        config,
                        is_best=True,
                        prefix="best_accuracy",
                        best_model_dict=best_model_dict,
                        epoch=epoch,
                        global_step=global_step,
                    )
                best_str = "best metric, {}".format(
                    ", ".join(
                        ["{}: {}".format(k, v) for k, v in best_model_dict.items()]
                    )
                )
                logger.info(best_str)
                # logger best metric
                if log_writer is not None:
                    log_writer.log_metrics(
                        metrics={
                            "best_{}".format(main_indicator): best_model_dict[
                                main_indicator
                            ]
                        },
                        prefix="EVAL",
                        step=global_step,
                    )

                    log_writer.log_model(
                        is_best=True, prefix="best_accuracy", metadata=best_model_dict
                    )

            reader_start = time.time()
        if dist.get_rank() == 0:
            save_model(
                model,
                optimizer,
                save_model_dir,
                logger,
                config,
                is_best=False,
                prefix="latest",
                best_model_dict=best_model_dict,
                epoch=epoch,
                global_step=global_step,
            )

            if log_writer is not None:
                log_writer.log_model(is_best=False, prefix="latest")

        if dist.get_rank() == 0 and epoch > 0 and epoch % save_epoch_step == 0:
            save_model(
                model,
                optimizer,
                save_model_dir,
                logger,
                config,
                is_best=False,
                prefix="iter_epoch_{}".format(epoch),
                best_model_dict=best_model_dict,
                epoch=epoch,
                global_step=global_step,
            )
            if log_writer is not None:
                log_writer.log_model(
                    is_best=False, prefix="iter_epoch_{}".format(epoch)
                )

    best_str = "best metric, {}".format(
        ", ".join(["{}: {}".format(k, v) for k, v in best_model_dict.items()])
    )
    logger.info(best_str)
    if dist.get_rank() == 0 and log_writer is not None:
        log_writer.close()
    return
def eval(
    model,
    valid_dataloader,
    post_process_class,
    eval_class,
    model_type=None,
    extra_input=False,
    scaler=None,
    amp_level="O2",
    amp_custom_black_list=None,
    amp_custom_white_list=None,
    amp_dtype="float16",
):
    """Evaluate ``model`` on ``valid_dataloader`` and return the metric dict.

    The model is switched to eval mode for the pass and back to train mode
    before returning. Gradients are disabled with ``paddle.no_grad()``.

    Args:
        model: Network to evaluate.
        valid_dataloader: Loader yielding batches whose first item is the
            image tensor.
        post_process_class: Decoder applied to predictions; may be None for
            the "table" / "kie" model types.
        eval_class: Metric accumulator; ``get_metric()`` gives the result.
        model_type: Selects the forward / metric path ("table", "kie",
            "can", "sr"; anything else uses the default path).
        extra_input (bool): When true the model also receives ``batch[1:]``.
        scaler: A truthy value runs the forward pass under
            ``paddle.amp.auto_cast``.
        amp_level, amp_custom_black_list, amp_custom_white_list, amp_dtype:
            Forwarded to ``auto_cast``; list defaults are ``[]``.

    Returns:
        dict: ``eval_class.get_metric()`` plus an "fps" entry (0.0 when no
        batch was timed).
    """
    # Avoid sharing mutable default lists between calls.
    if amp_custom_black_list is None:
        amp_custom_black_list = []
    if amp_custom_white_list is None:
        amp_custom_white_list = []

    def _forward(batch):
        # One forward pass using the inputs this model type expects.
        if model_type == "table" or extra_input:
            return model(batch[0], data=batch[1:])
        if model_type in ["kie", "sr"]:
            return model(batch)
        if model_type in ["can"]:
            return model(batch[:3])
        return model(batch[0])

    model.eval()
    with paddle.no_grad():
        total_frame = 0.0
        total_time = 0.0
        pbar = tqdm(
            total=len(valid_dataloader), desc="eval model:", position=0, leave=True
        )
        max_iter = (
            len(valid_dataloader) - 1
            if platform.system() == "Windows"
            else len(valid_dataloader)
        )
        for idx, batch in enumerate(valid_dataloader):
            if idx >= max_iter:
                break
            images = batch[0]
            start = time.time()

            # use amp
            if scaler:
                with paddle.amp.auto_cast(
                    level=amp_level,
                    custom_black_list=amp_custom_black_list,
                    # Previously accepted but never forwarded; train() passes it.
                    custom_white_list=amp_custom_white_list,
                    dtype=amp_dtype,
                ):
                    preds = _forward(batch)
                preds = to_float32(preds)
            else:
                preds = _forward(batch)

            # Non-tensor batch items are passed through unchanged.
            batch_numpy = [
                item.numpy() if isinstance(item, paddle.Tensor) else item
                for item in batch
            ]
            # Obtain usable results from post-processing methods
            total_time += time.time() - start
            # Evaluate the results of the current batch
            if model_type in ["table", "kie"]:
                if post_process_class is None:
                    eval_class(preds, batch_numpy)
                else:
                    post_result = post_process_class(preds, batch_numpy)
                    eval_class(post_result, batch_numpy)
            elif model_type in ["sr"]:
                eval_class(preds, batch_numpy)
            elif model_type in ["can"]:
                eval_class(preds[0], batch_numpy[2:], epoch_reset=(idx == 0))
            else:
                post_result = post_process_class(preds, batch_numpy[1])
                eval_class(post_result, batch_numpy)

            pbar.update(1)
            total_frame += len(images)
        # Get final metric,eg. acc or hmean
        metric = eval_class.get_metric()

    pbar.close()
    model.train()
    # No batch is timed when the loader is empty (or has one batch on
    # Windows, where the last batch is skipped); avoid dividing by zero.
    metric["fps"] = total_frame / total_time if total_time > 0 else 0.0
    return metric
def update_center(char_center, post_result, preds):
    """Fold one batch of features into the running per-class centers.

    Args:
        char_center (dict): Maps a predicted class index to
            ``[mean_feature, count]``; updated in place.
        post_result: ``(result, label)`` pair from the post-processor. Only
            samples whose ``result[i][0]`` equals ``label[i][0]`` contribute.
        preds: ``(feats, logits)`` pair of tensors from the model.

    Returns:
        dict: The updated ``char_center``.
    """
    result, label = post_result
    feats, logits = preds
    logits = paddle.argmax(logits, axis=-1)
    feats = feats.numpy()
    logits = logits.numpy()

    for idx_sample in range(len(label)):
        if result[idx_sample][0] == label[idx_sample][0]:
            feat = feats[idx_sample]
            logit = logits[idx_sample]
            for idx_time in range(len(logit)):
                index = logit[idx_time]
                if index in char_center:
                    # Incremental mean: (old_mean * n + x) / (n + 1).
                    char_center[index][0] = (
                        char_center[index][0] * char_center[index][1] + feat[idx_time]
                    ) / (char_center[index][1] + 1)
                    char_center[index][1] += 1
                else:
                    char_center[index] = [feat[idx_time], 1]
    return char_center


def get_center(model, eval_dataloader, post_process_class):
    """Compute the mean feature per predicted class over ``eval_dataloader``.

    Returns:
        dict: Maps each class index seen to its mean feature (the counts
        kept by ``update_center`` are dropped).
    """
    pbar = tqdm(total=len(eval_dataloader), desc="get center:")
    max_iter = (
        len(eval_dataloader) - 1
        if platform.system() == "Windows"
        else len(eval_dataloader)
    )
    char_center = dict()
    for idx, batch in enumerate(eval_dataloader):
        if idx >= max_iter:
            break
        images = batch[0]
        preds = model(images)

        # Same conversion as eval(): leave non-tensor items untouched rather
        # than failing on a missing .numpy().
        batch = [
            item.numpy() if isinstance(item, paddle.Tensor) else item
            for item in batch
        ]
        # Obtain usable results from post-processing methods
        post_result = post_process_class(preds, batch[1])

        # update char_center
        char_center = update_center(char_center, post_result, preds)
        pbar.update(1)

    pbar.close()
    for key in char_center.keys():
        char_center[key] = char_center[key][0]
    return char_center
def preprocess(is_train=False):
    """Parse the command line and prepare config, device and loggers.

    Steps: load the YAML config and apply ``-o`` overrides, store the
    profiler options under ``config["profiler_options"]``, optionally dump
    the merged config and open ``train.log`` in ``Global.save_model_dir``
    (training only), validate the algorithm name, select the paddle device,
    and create the optional Weights & Biases writer.

    Args:
        is_train (bool): When true, write ``config.yml`` and log to
            ``train.log`` inside ``Global.save_model_dir``.

    Returns:
        tuple: ``(config, device, logger, log_writer)``; ``log_writer`` is
        None when no experiment logger is configured.

    Raises:
        AssertionError: if ``Architecture.algorithm`` is not supported.
    """
    FLAGS = ArgsParser().parse_args()
    config = load_config(FLAGS.config)
    config = merge_config(config, FLAGS.opt)
    config = merge_config(config, {"profiler_options": FLAGS.profiler_options})

    if is_train:
        # save_config
        save_model_dir = config["Global"]["save_model_dir"]
        os.makedirs(save_model_dir, exist_ok=True)
        with open(os.path.join(save_model_dir, "config.yml"), "w") as f:
            yaml.dump(dict(config), f, default_flow_style=False, sort_keys=False)
        log_file = "{}/train.log".format(save_model_dir)
    else:
        log_file = None
    logger = get_logger(log_file=log_file)

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config["Global"].get("use_gpu", False)
    use_xpu = config["Global"].get("use_xpu", False)
    use_npu = config["Global"].get("use_npu", False)
    use_mlu = config["Global"].get("use_mlu", False)

    alg = config["Architecture"]["algorithm"]
    assert alg in [
        "EAST",
        "DB",
        "SAST",
        "Rosetta",
        "CRNN",
        "STARNet",
        "RARE",
        "SRN",
        "CLS",
        "PGNet",
        "Distillation",
        "NRTR",
        "TableAttn",
        "SAR",
        "PSE",
        "SEED",
        "SDMGR",
        "LayoutXLM",
        "LayoutLM",
        "LayoutLMv2",
        "PREN",
        "FCE",
        "SVTR",
        "SVTR_LCNet",
        "ViTSTR",
        "ABINet",
        "DB++",
        "TableMaster",
        "SPIN",
        "VisionLAN",
        "Gestalt",
        "SLANet",
        "RobustScanner",
        "CT",
        "RFL",
        "DRRG",
        "CAN",
        "Telescope",
        "SATRN",
        "SVTR_HGNet",
        "ParseQ",
        "CPPD",
    ]

    if use_xpu:
        device = "xpu:{0}".format(os.getenv("FLAGS_selected_xpus", 0))
    elif use_npu:
        device = "npu:{0}".format(os.getenv("FLAGS_selected_npus", 0))
    elif use_mlu:
        device = "mlu:{0}".format(os.getenv("FLAGS_selected_mlus", 0))
    else:
        device = "gpu:{}".format(dist.ParallelEnv().dev_id) if use_gpu else "cpu"
    check_device(use_gpu, use_xpu, use_npu, use_mlu)

    device = paddle.set_device(device)

    config["Global"]["distributed"] = dist.get_world_size() != 1

    loggers = []

    if config["Global"].get("use_visualdl"):
        logger.warning(
            "You are using VisualDL, the VisualDL is deprecated and "
            "removed in ppocr!"
        )
    if config["Global"].get("use_wandb") or "wandb" in config:
        save_dir = config["Global"]["save_model_dir"]
        # When a "wandb" section exists it is updated in place with save_dir.
        wandb_params = config["wandb"] if "wandb" in config else dict()
        wandb_params.update({"save_dir": save_dir})
        loggers.append(WandbLogger(**wandb_params, config=config))
    print_dict(config, logger)

    log_writer = Loggers(loggers) if loggers else None

    logger.info("train with paddle {} and device {}".format(paddle.__version__, device))
    return config, device, logger, log_writer
def main():
    """Evaluate the configured model and write ``evaluation/metrics.json``.

    Reads the module-level ``config``, ``device`` and ``logger`` that the
    ``__main__`` guard obtains from ``program.preprocess()``.
    """
    global_config = config["Global"]
    # build dataloader
    set_signal_handlers()
    valid_dataloader = build_dataloader(config, "Eval", device, logger)

    # build post process
    post_process_class = build_post_process(config["PostProcess"], global_config)

    # build model
    # for rec algorithm
    if hasattr(post_process_class, "character"):
        char_num = len(getattr(post_process_class, "character"))
        if config["Architecture"]["algorithm"] in [
            "Distillation",
        ]:  # distillation model
            for key in config["Architecture"]["Models"]:
                if (
                    config["Architecture"]["Models"][key]["Head"]["name"] == "MultiHead"
                ):  # for multi head
                    out_channels_list = {}
                    if config["PostProcess"]["name"] == "DistillationSARLabelDecode":
                        char_num = char_num - 2
                    if config["PostProcess"]["name"] == "DistillationNRTRLabelDecode":
                        char_num = char_num - 3
                    out_channels_list["CTCLabelDecode"] = char_num
                    out_channels_list["SARLabelDecode"] = char_num + 2
                    out_channels_list["NRTRLabelDecode"] = char_num + 3
                    config["Architecture"]["Models"][key]["Head"][
                        "out_channels_list"
                    ] = out_channels_list
                else:
                    config["Architecture"]["Models"][key]["Head"][
                        "out_channels"
                    ] = char_num
        elif config["Architecture"]["Head"]["name"] == "MultiHead":  # for multi head
            out_channels_list = {}
            if config["PostProcess"]["name"] == "SARLabelDecode":
                char_num = char_num - 2
            if config["PostProcess"]["name"] == "NRTRLabelDecode":
                char_num = char_num - 3
            out_channels_list["CTCLabelDecode"] = char_num
            out_channels_list["SARLabelDecode"] = char_num + 2
            out_channels_list["NRTRLabelDecode"] = char_num + 3
            config["Architecture"]["Head"]["out_channels_list"] = out_channels_list
        else:  # base rec model
            config["Architecture"]["Head"]["out_channels"] = char_num

    model = build_model(config["Architecture"])
    extra_input_models = [
        "SRN",
        "NRTR",
        "SAR",
        "SEED",
        "SVTR",
        "SVTR_LCNet",
        "VisionLAN",
        "RobustScanner",
        "SVTR_HGNet",
    ]
    extra_input = False
    if config["Architecture"]["algorithm"] == "Distillation":
        for key in config["Architecture"]["Models"]:
            extra_input = (
                extra_input
                or config["Architecture"]["Models"][key]["algorithm"]
                in extra_input_models
            )
    else:
        extra_input = config["Architecture"]["algorithm"] in extra_input_models
    if "model_type" in config["Architecture"].keys():
        if config["Architecture"]["algorithm"] == "CAN":
            model_type = "can"
        else:
            model_type = config["Architecture"]["model_type"]
    else:
        model_type = None

    # build metric
    eval_class = build_metric(config["Metric"])
    # amp
    use_amp = config["Global"].get("use_amp", False)
    amp_level = config["Global"].get("amp_level", "O2")
    amp_dtype = config["Global"].get("amp_dtype", "float16")
    amp_custom_black_list = config["Global"].get("amp_custom_black_list", [])
    amp_custom_white_list = config["Global"].get("amp_custom_white_list", [])
    if use_amp:
        AMP_RELATED_FLAGS_SETTING = {
            "FLAGS_max_inplace_grad_add": 8,
        }
        # Same guard as the training script: set the cuDNN flag only on a
        # CUDA build.
        if paddle.is_compiled_with_cuda():
            AMP_RELATED_FLAGS_SETTING["FLAGS_cudnn_batchnorm_spatial_persistent"] = 1
        paddle.set_flags(AMP_RELATED_FLAGS_SETTING)
        scale_loss = config["Global"].get("scale_loss", 1.0)
        use_dynamic_loss_scaling = config["Global"].get(
            "use_dynamic_loss_scaling", False
        )
        scaler = paddle.amp.GradScaler(
            init_loss_scaling=scale_loss,
            use_dynamic_loss_scaling=use_dynamic_loss_scaling,
        )
        if amp_level == "O2":
            model = paddle.amp.decorate(
                models=model, level=amp_level, master_weight=True, dtype=amp_dtype
            )
    else:
        scaler = None

    # Pass model_type only when configured so a config without it no longer
    # raises KeyError; load_model then falls back to its own default.
    load_kwargs = {}
    if "model_type" in config["Architecture"]:
        load_kwargs["model_type"] = config["Architecture"]["model_type"]
    best_model_dict = load_model(config, model, **load_kwargs)
    if len(best_model_dict):
        logger.info("metric in ckpt ***************")
        for k, v in best_model_dict.items():
            logger.info("{}:{}".format(k, v))

    # start eval
    metric = program.eval(
        model,
        valid_dataloader,
        post_process_class,
        eval_class,
        model_type,
        extra_input,
        scaler,
        amp_level,
        amp_custom_black_list,
        amp_custom_white_list=amp_custom_white_list,
        amp_dtype=amp_dtype,
    )
    os.makedirs("evaluation", exist_ok=True)

    # Save metrics to evaluation/metrics.json
    with open("evaluation/metrics.json", "w") as f:
        json.dump(metric, f, indent=4)

    logger.info("metric eval ***************")
    for k, v in metric.items():
        logger.info("{}:{}".format(k, v))
def parse_args():
    """Parse the ppstructure arguments plus ``--gt_path``."""
    parser = init_args()
    parser.add_argument("--gt_path", type=str)
    return parser.parse_args()


def load_txt(txt_path):
    """Read a ``<image name>\\t<html>`` file into a dict.

    Blank lines are skipped, and only the first tab separates the image
    name from the HTML, so the HTML may contain tabs.

    Returns:
        dict: image name -> HTML string; empty when the file does not exist.

    Raises:
        ValueError: if a non-blank line contains no tab.
    """
    pred_html_dict = {}
    if not os.path.exists(txt_path):
        return pred_html_dict
    with open(txt_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            img_name, pred_html = line.split("\t", 1)
            pred_html_dict[img_name] = pred_html
    return pred_html_dict


def load_result(path):
    """Load a pickled result dict, or return ``{}`` when ``path`` is missing."""
    data = {}
    if os.path.exists(path):
        # NOTE(review): pickle executes code on load; only read caches this
        # script wrote itself.
        with open(path, "rb") as f:
            data = pickle.load(f)
    return data


def save_result(path, data):
    """Merge ``data`` into the pickle stored at ``path`` and write it back."""
    old_data = load_result(path)
    old_data.update(data)
    with open(path, "wb") as f:
        pickle.dump(old_data, f)


def main(gt_path, img_root, args):
    """Run OCR + structure prediction on every ground-truth image and log TEDS.

    OCR and structure results are cached in ``ocr.pickle`` and
    ``structure.pickle`` under ``args.output``, so a rerun only processes
    images that are not cached yet.

    Args:
        gt_path (str): Ground-truth file in the format read by ``load_txt``.
        img_root (str): Directory containing the images named in ``gt_path``.
        args: Parsed arguments; ``args.output`` is the cache directory and
            the whole namespace is passed to ``TableSystem``.
    """
    os.makedirs(args.output, exist_ok=True)
    # init TableSystem
    text_sys = TableSystem(args)
    # load gt and preds html result
    gt_html_dict = load_txt(gt_path)

    ocr_result = load_result(os.path.join(args.output, "ocr.pickle"))
    structure_result = load_result(os.path.join(args.output, "structure.pickle"))

    pred_htmls = []
    gt_htmls = []
    for img_name, gt_html in tqdm(gt_html_dict.items()):
        img = cv2.imread(os.path.join(img_root, img_name))
        needs_inference = (
            img_name not in ocr_result or img_name not in structure_result
        )
        if img is None and needs_inference:
            # cv2.imread returns None for a missing or unreadable file.
            logger.warning("cannot read image {}, skipped".format(img_name))
            continue
        # run ocr and save result
        if img_name not in ocr_result:
            dt_boxes, rec_res, _, _ = text_sys._ocr(img)
            ocr_result[img_name] = [dt_boxes, rec_res]
            save_result(os.path.join(args.output, "ocr.pickle"), ocr_result)
        # run structure and save result
        if img_name not in structure_result:
            structure_res, _ = text_sys._structure(img)
            structure_result[img_name] = structure_res
            save_result(os.path.join(args.output, "structure.pickle"), structure_result)
        dt_boxes, rec_res = ocr_result[img_name]
        structure_res = structure_result[img_name]
        # match ocr and structure
        pred_html = text_sys.match(structure_res, dt_boxes, rec_res)

        pred_htmls.append(pred_html)
        gt_htmls.append(gt_html)

    if not pred_htmls:
        # Nothing to score (missing or empty ground truth); the averages
        # below would divide by zero.
        logger.warning("no samples evaluated, check --gt_path and --image_dir")
        return

    # compute teds
    teds = TEDS(n_jobs=16, structure_only=True)
    teds2 = TEDS(n_jobs=16)
    scores = teds.batch_evaluate_html(gt_htmls, pred_htmls)
    scores2 = teds2.batch_evaluate_html(gt_htmls, pred_htmls)
    logger.info("s-teds: {}".format(sum(scores) / len(scores)))
    logger.info("teds: {}".format(sum(scores2) / len(scores2)))
def main(config, device, logger, vdl_writer, seed):
    """Build every training component from ``config`` and run ``program.train``.

    Args:
        config (dict): Merged global config.
        device: Device returned by ``program.preprocess``.
        logger: Logger for progress messages.
        vdl_writer: Optional metrics writer forwarded to ``program.train``.
        seed (int): Seed forwarded to ``build_dataloader``.
    """
    # init dist environment
    if config["Global"]["distributed"]:
        dist.init_parallel_env()

    global_config = config["Global"]

    # build dataloader
    set_signal_handlers()
    train_dataloader = build_dataloader(config, "Train", device, logger, seed)
    if len(train_dataloader) == 0:
        logger.error(
            "No Images in train dataset, please ensure\n"
            + "\t1. The images num in the train label_file_list should be larger than or equal with batch size.\n"
            + "\t2. The annotation file and path in the configuration file are provided normally."
        )
        return

    # A config without an "Eval" section trains without evaluation instead
    # of raising KeyError.
    if config.get("Eval"):
        valid_dataloader = build_dataloader(config, "Eval", device, logger, seed)
    else:
        valid_dataloader = None
    step_pre_epoch = len(train_dataloader)

    # build post process
    post_process_class = build_post_process(config["PostProcess"], global_config)

    # build model
    # for rec algorithm
    if hasattr(post_process_class, "character"):
        char_num = len(getattr(post_process_class, "character"))
        if config["Architecture"]["algorithm"] in [
            "Distillation",
        ]:  # distillation model
            for key in config["Architecture"]["Models"]:
                if (
                    config["Architecture"]["Models"][key]["Head"]["name"] == "MultiHead"
                ):  # for multi head
                    # NOTE(review): char_num is decremented once per MultiHead
                    # model in this loop — confirm this is intended when
                    # several sub-models are configured.
                    if config["PostProcess"]["name"] == "DistillationSARLabelDecode":
                        char_num = char_num - 2
                    if config["PostProcess"]["name"] == "DistillationNRTRLabelDecode":
                        char_num = char_num - 3
                    out_channels_list = {}
                    out_channels_list["CTCLabelDecode"] = char_num
                    # update SARLoss params
                    if (
                        list(config["Loss"]["loss_config_list"][-1].keys())[0]
                        == "DistillationSARLoss"
                    ):
                        config["Loss"]["loss_config_list"][-1]["DistillationSARLoss"][
                            "ignore_index"
                        ] = (char_num + 1)
                        out_channels_list["SARLabelDecode"] = char_num + 2
                    elif any(
                        "DistillationNRTRLoss" in d
                        for d in config["Loss"]["loss_config_list"]
                    ):
                        out_channels_list["NRTRLabelDecode"] = char_num + 3

                    config["Architecture"]["Models"][key]["Head"][
                        "out_channels_list"
                    ] = out_channels_list
                else:
                    config["Architecture"]["Models"][key]["Head"][
                        "out_channels"
                    ] = char_num
        elif config["Architecture"]["Head"]["name"] == "MultiHead":  # for multi head
            if config["PostProcess"]["name"] == "SARLabelDecode":
                char_num = char_num - 2
            if config["PostProcess"]["name"] == "NRTRLabelDecode":
                char_num = char_num - 3
            out_channels_list = {}
            out_channels_list["CTCLabelDecode"] = char_num
            # update SARLoss params
            if list(config["Loss"]["loss_config_list"][1].keys())[0] == "SARLoss":
                if config["Loss"]["loss_config_list"][1]["SARLoss"] is None:
                    config["Loss"]["loss_config_list"][1]["SARLoss"] = {
                        "ignore_index": char_num + 1
                    }
                else:
                    config["Loss"]["loss_config_list"][1]["SARLoss"]["ignore_index"] = (
                        char_num + 1
                    )
                out_channels_list["SARLabelDecode"] = char_num + 2
            elif list(config["Loss"]["loss_config_list"][1].keys())[0] == "NRTRLoss":
                out_channels_list["NRTRLabelDecode"] = char_num + 3
            config["Architecture"]["Head"]["out_channels_list"] = out_channels_list
        else:  # base rec model
            config["Architecture"]["Head"]["out_channels"] = char_num

        if config["PostProcess"]["name"] == "SARLabelDecode":  # for SAR model
            config["Loss"]["ignore_index"] = char_num - 1

    model = build_model(config["Architecture"])

    use_sync_bn = config["Global"].get("use_sync_bn", False)
    if use_sync_bn:
        model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        logger.info("convert_sync_batchnorm")

    model = apply_to_static(model, config, logger)

    # build loss
    loss_class = build_loss(config["Loss"])

    # build optim
    optimizer, lr_scheduler = build_optimizer(
        config["Optimizer"],
        epochs=config["Global"]["epoch_num"],
        step_each_epoch=len(train_dataloader),
        model=model,
    )

    # build metric
    eval_class = build_metric(config["Metric"])

    logger.info("train dataloader has {} iters".format(len(train_dataloader)))
    if valid_dataloader is not None:
        logger.info("valid dataloader has {} iters".format(len(valid_dataloader)))

    use_amp = config["Global"].get("use_amp", False)
    amp_level = config["Global"].get("amp_level", "O2")
    amp_dtype = config["Global"].get("amp_dtype", "float16")
    amp_custom_black_list = config["Global"].get("amp_custom_black_list", [])
    amp_custom_white_list = config["Global"].get("amp_custom_white_list", [])
    if use_amp:
        AMP_RELATED_FLAGS_SETTING = {
            "FLAGS_max_inplace_grad_add": 8,
        }
        if paddle.is_compiled_with_cuda():
            AMP_RELATED_FLAGS_SETTING.update(
                {
                    "FLAGS_cudnn_batchnorm_spatial_persistent": 1,
                    "FLAGS_gemm_use_half_precision_compute_type": 0,
                }
            )
        paddle.set_flags(AMP_RELATED_FLAGS_SETTING)
        scale_loss = config["Global"].get("scale_loss", 1.0)
        use_dynamic_loss_scaling = config["Global"].get(
            "use_dynamic_loss_scaling", False
        )
        scaler = paddle.amp.GradScaler(
            init_loss_scaling=scale_loss,
            use_dynamic_loss_scaling=use_dynamic_loss_scaling,
        )
        if amp_level == "O2":
            model, optimizer = paddle.amp.decorate(
                models=model,
                optimizers=optimizer,
                level=amp_level,
                master_weight=True,
                dtype=amp_dtype,
            )
    else:
        scaler = None

    # load pretrain model
    # Pass model_type only when configured so a config without it no longer
    # raises KeyError; load_model then falls back to its own default.
    load_args = [config, model, optimizer]
    if "model_type" in config["Architecture"]:
        load_args.append(config["Architecture"]["model_type"])
    pre_best_model_dict = load_model(*load_args)

    if config["Global"]["distributed"]:
        model = paddle.DataParallel(model)
    # start train
    program.train(
        config,
        train_dataloader,
        valid_dataloader,
        device,
        model,
        loss_class,
        optimizer,
        lr_scheduler,
        post_process_class,
        eval_class,
        pre_best_model_dict,
        logger,
        step_pre_epoch,
        vdl_writer,
        scaler,
        amp_level,
        amp_custom_black_list,
        amp_custom_white_list,
        amp_dtype,
    )


def test_reader(config, device, logger):
    """Iterate the training dataloader once, logging the time per batch.

    Debug helper: a failure while reading is logged and ends the run without
    claiming success.
    """
    import time

    loader = build_dataloader(config, "Train", device, logger)

    starttime = time.time()
    count = 0
    try:
        for data in loader():
            count += 1
            batch_time = time.time() - starttime
            starttime = time.time()
            logger.info(
                "reader: {}, {}, {}".format(count, len(data[0]), batch_time)
            )
    except Exception as e:
        logger.info(e)
        # Previously fell through to the "Success!" message below.
        logger.error("reader failed after {} batches".format(count))
        return
    logger.info("finish reader: {}, Success!".format(count))