adamlu1 committed on
Commit
b35e1d0
·
1 Parent(s): 39f8e6b

auto adj bbox width

Browse files
Files changed (2) hide show
  1. app.py +14 -29
  2. imgs/saved_image_demo.png +0 -0
app.py CHANGED
@@ -1,5 +1,5 @@
1
  from typing import Optional
2
- import spaces
3
 
4
  import gradio as gr
5
  import numpy as np
@@ -25,31 +25,6 @@ caption_model_processor = {'processor': processor, 'model': model}
25
  print('finish loading model!!!')
26
 
27
 
28
- platform = 'pc'
29
- if platform == 'pc':
30
- draw_bbox_config = {
31
- 'text_scale': 0.8,
32
- 'text_thickness': 2,
33
- 'text_padding': 2,
34
- 'thickness': 2,
35
- }
36
- elif platform == 'web':
37
- draw_bbox_config = {
38
- 'text_scale': 0.8,
39
- 'text_thickness': 2,
40
- 'text_padding': 3,
41
- 'thickness': 3,
42
- }
43
- elif platform == 'mobile':
44
- draw_bbox_config = {
45
- 'text_scale': 0.8,
46
- 'text_thickness': 2,
47
- 'text_padding': 3,
48
- 'thickness': 3,
49
- }
50
-
51
-
52
-
53
  MARKDOWN = """
54
  # OmniParser for Pure Vision Based General GUI Agent 🔥
55
  <div>
@@ -59,6 +34,8 @@ MARKDOWN = """
59
  </div>
60
 
61
  OmniParser is a screen parsing tool to convert general GUI screen to structured elements.
 
 
62
  """
63
 
64
  # DEVICE = torch.device('cuda')
@@ -66,7 +43,7 @@ OmniParser is a screen parsing tool to convert general GUI screen to structured
66
  # @spaces.GPU
67
  @torch.inference_mode()
68
  # @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
69
- @spaces.GPU(duration=65)
70
  def process(
71
  image_input,
72
  box_threshold,
@@ -76,6 +53,14 @@ def process(
76
  image_save_path = 'imgs/saved_image_demo.png'
77
  image_input.save(image_save_path)
78
  # import pdb; pdb.set_trace()
 
 
 
 
 
 
 
 
79
 
80
  ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=True)
81
  text, ocr_bbox = ocr_bbox_rslt
@@ -117,5 +102,5 @@ with gr.Blocks() as demo:
117
  )
118
 
119
  # demo.launch(debug=False, show_error=True, share=True)
120
- # demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
121
- demo.queue().launch(share=False)
 
1
  from typing import Optional
2
+ # import spaces
3
 
4
  import gradio as gr
5
  import numpy as np
 
25
  print('finish loading model!!!')
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  MARKDOWN = """
29
  # OmniParser for Pure Vision Based General GUI Agent 🔥
30
  <div>
 
34
  </div>
35
 
36
  OmniParser is a screen parsing tool to convert general GUI screen to structured elements.
37
+
38
+ 📢 [[Project Page](https://microsoft.github.io/OmniParser/)] [[Blog Post](https://www.microsoft.com/en-us/research/articles/omniparser-for-pure-vision-based-gui-agent/)] [[Models](https://huggingface.co/microsoft/OmniParser)]
39
  """
40
 
41
  # DEVICE = torch.device('cuda')
 
43
  # @spaces.GPU
44
  @torch.inference_mode()
45
  # @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
46
+ # @spaces.GPU(duration=65)
47
  def process(
48
  image_input,
49
  box_threshold,
 
53
  image_save_path = 'imgs/saved_image_demo.png'
54
  image_input.save(image_save_path)
55
  # import pdb; pdb.set_trace()
56
+ image = Image.open(image_save_path)
57
+ box_overlay_ratio = image.size[0] / 3200
58
+ draw_bbox_config = {
59
+ 'text_scale': 0.8 * box_overlay_ratio,
60
+ 'text_thickness': max(int(2 * box_overlay_ratio), 1),
61
+ 'text_padding': max(int(3 * box_overlay_ratio), 1),
62
+ 'thickness': max(int(3 * box_overlay_ratio), 1),
63
+ }
64
 
65
  ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_save_path, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=True)
66
  text, ocr_bbox = ocr_bbox_rslt
 
102
  )
103
 
104
  # demo.launch(debug=False, show_error=True, share=True)
105
+ demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
106
+ # demo.queue().launch(share=False)
imgs/saved_image_demo.png CHANGED