Forum Archive

Text detection with Vision + CoreML

pavlinb

Hello,

I need help implementing text detection with Vision + CoreML in Pythonista.

Does someone have an example?

Regards.

cvp

@pavlinb see here

cvp

Try the script with a photo of a single letter, using this .mlmodel (to be downloaded).

Two little modifications in the script:

#MODEL_FILENAME = 'mobilenet.mlmodel'
MODEL_FILENAME = 'Alphanum_28x28.mlmodel'
        all_assets = photos.get_assets()
        asset = photos.pick_asset(assets=all_assets)
        #asset = photos.pick_asset()

pavlinb

Thank you for the fast response. I was actually inspired by that very link.

Do you know the proper way to use VNDetectTextRectanglesRequest to obtain text from an image in Pythonista?

Kind regards.

cvp

@pavlinb try this quick and dirty (but working 😀) code on an image containing text

# from https://github.com/SwiftBrain/HelloVision/blob/master/HelloVision/TextDetectionViewController.swift
from objc_util import *
import photos
import ui

VNImageRequestHandler = ObjCClass('VNImageRequestHandler')
VNDetectTextRectanglesRequest = ObjCClass('VNDetectTextRectanglesRequest')

def classify_asset(asset):
    img_data = ns(asset.get_image_data().getvalue())

    req = VNDetectTextRectanglesRequest.alloc().init()

    handler = VNImageRequestHandler.alloc().initWithData_options_(img_data, None).autorelease()
    success = handler.performRequests_error_([req], None)
    if success:
        im = ui.ImageView()
        ui_image = asset.get_ui_image()
        wi,hi = ui_image.size
        im.frame = (0,0,400,400*hi/wi)
        wi = im.width
        hi = im.height
        im.image = ui_image
        im.present('sheet')
        for i in range(0,len(req.results())):
            observation = req.results()[i]  
            box = observation.boundingBox()
            x=box.origin.x
            y=box.origin.y
            w=box.size.width
            h=box.size.height
            print('x=',x)
            print('y=',y)
            print('width=',w)
            print('height=',h)
            l = ui.Label()
            l.frame = (x*wi,y*hi,w*wi,h*hi)
            print(l.frame)
            l.border_width = 1
            l.border_color = 'red'
            im.add_subview(l)
            #print(dir(observation.boundingBox()))
            confidence = observation.confidence()
            print('confidence', confidence)
        print('ok')
    else:
        print('error')

def main():
    all_assets = photos.get_assets()
    asset = photos.pick_asset(assets=all_assets)
    if asset is None:
        return
    classify_asset(asset)

if __name__ == '__main__':
    main()
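A note on the coordinates: Vision's boundingBox values are normalized (0..1) with the origin at the bottom-left of the image, while Pythonista's ui module puts the origin at the top-left. If the drawn rectangles look vertically shifted (as discussed further down in this thread), a small helper like this sketch may help; the function name is ours, and it is the same math as above with only the y coordinate flipped:

def vision_box_to_ui_frame(box, view_w, view_h):
    # box is the normalized CGRect returned by observation.boundingBox()
    x, y = box.origin.x, box.origin.y
    w, h = box.size.width, box.size.height
    # Vision measures y from the bottom of the image, ui measures from the top
    return (x * view_w, (1.0 - y - h) * view_h, w * view_w, h * view_h)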

pavlinb

Cool. Thanks.

But is there any way to guess the proper calling conventions for native iOS frameworks in Pythonista?

There are a lot of functions from Vision that I want to test...

Regards.

cvp

@pavlinb examples, experience, and the forum 😀
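For what it's worth, the "experiment and inspect" approach can be partly automated: objc_util lets you look up any Objective-C class by name, and dir() on the resulting object lists the exposed selectors (the scripts above already use print(dir(observation)) for exactly that). A minimal sketch:

from objc_util import ObjCClass

req = ObjCClass('VNDetectTextRectanglesRequest').alloc().init()
# Objective-C selectors appear as Python attributes with ':' replaced by '_'
print([name for name in dir(req) if 'Character' in name])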

pavlinb

@cvp said:

Try the script with a photo of a single letter, using this .mlmodel (to be downloaded).

Two little modifications in the script:
#MODEL_FILENAME = 'mobilenet.mlmodel'
MODEL_FILENAME = 'Alphanum_28x28.mlmodel'
all_assets = photos.get_assets()
asset = photos.pick_asset(assets=all_assets)
#asset = photos.pick_asset()

This example works well on one character. Is there a way to make it work on more symbols?

cvp

@pavlinb Try this script, and then try to combine it with the mlmodel script

# from https://github.com/SwiftBrain/HelloVision/blob/master/HelloVision/TextDetectionViewController.swift
from objc_util import *
import photos
import ui

VNImageRequestHandler = ObjCClass('VNImageRequestHandler')
VNDetectTextRectanglesRequest = ObjCClass('VNDetectTextRectanglesRequest')

def classify_asset(asset):
    img_data = ns(asset.get_image_data().getvalue())

    req = VNDetectTextRectanglesRequest.alloc().init()
    req.reportCharacterBoxes = True

    handler = VNImageRequestHandler.alloc().initWithData_options_(img_data, None).autorelease()
    success = handler.performRequests_error_([req], None)
    if success:
        im = ui.ImageView()
        ui_image = asset.get_ui_image()
        wi,hi = ui_image.size
        im.frame = (0,0,400,400*hi/wi)
        wi = im.width
        hi = im.height
        im.image = ui_image
        im.present('sheet')
        for i in range(0,len(req.results())):
            observation = req.results()[i]  
            box = observation.boundingBox()
            x=box.origin.x
            y=box.origin.y
            w=box.size.width
            h=box.size.height
            print('x=',x)
            print('y=',y)
            print('width=',w)
            print('height=',h)
            l = ui.Label()
            l.frame = (x*wi,y*hi,w*wi,h*hi)
            print(l.frame)
            l.border_width = 1
            l.border_color = 'red'
            im.add_subview(l)
            print(dir(observation))
            confidence = observation.confidence()
            print('confidence', confidence)
            for i_ch in range(0,len(observation.characterBoxes())):
              ch_box = observation.characterBoxes()[i_ch]
              box = ch_box.boundingBox()
              x=box.origin.x
              y=box.origin.y
              w=box.size.width
              h=box.size.height
              print('x=',x)
              print('y=',y)
              print('width=',w)
              print('height=',h)
              l = ui.Label()
              l.frame = (x*wi,y*hi,w*wi,h*hi)
              print(l.frame)
              l.border_width = 1
              l.border_color = 'blue'
              im.add_subview(l)
              #print(dir(ch_box))
              #break
        print('ok')
    else:
        print('error')

def main():
    all_assets = photos.get_assets()
    asset = photos.pick_asset(assets=all_assets)
    if asset is None:
        return
    classify_asset(asset)

if __name__ == '__main__':
    main()

cvp

@pavlinb this should work, but the result is not correct 😢

#!python3
'''
This is a demo of how you can use the CoreML framework (via objc_util) to classify images in Pythonista. It downloads the trained 'MobileNet' CoreML model from the Internet, and uses it to classify images that are either taken with the camera, or picked from the photo library.
'''

import requests
import os
import io
import photos
import dialogs
from PIL import Image
from objc_util import ObjCClass, nsurl, ns
import ui

# Configuration (change URL and filename if you want to use a different model):
MODEL_URL = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel'
MODEL_FILENAME = 'Alphanum_28x28.mlmodel'

# Use a local path for caching the model file (no need to sync this with iCloud):
MODEL_PATH = os.path.join(os.path.expanduser('~/Documents'), MODEL_FILENAME)

# Declare/import ObjC classes:
MLModel = ObjCClass('MLModel')
VNCoreMLModel = ObjCClass('VNCoreMLModel')
VNCoreMLRequest = ObjCClass('VNCoreMLRequest')

VNImageRequestHandler = ObjCClass('VNImageRequestHandler')
VNDetectTextRectanglesRequest = ObjCClass('VNDetectTextRectanglesRequest')


def load_model():
    '''Helper method for downloading/caching the mlmodel file'''
    if not os.path.exists(MODEL_PATH):
        print(f'Downloading model: {MODEL_FILENAME}...')
        r = requests.get(MODEL_URL, stream=True)
        file_size = int(r.headers['content-length'])
        with open(MODEL_PATH, 'wb') as f:
            bytes_written = 0
            for chunk in r.iter_content(1024*100):
                f.write(chunk)
                print(f'{bytes_written/file_size*100:.2f}% downloaded')
                bytes_written += len(chunk)
        print('Download finished')
    ml_model_url = nsurl(MODEL_PATH)
    # Compile the model:
    c_model_url = MLModel.compileModelAtURL_error_(ml_model_url, None)
    # Load model from the compiled model file:
    ml_model = MLModel.modelWithContentsOfURL_error_(c_model_url, None)
    # Create a VNCoreMLModel from the MLModel for use with the Vision framework:
    vn_model = VNCoreMLModel.modelForMLModel_error_(ml_model, None)
    return vn_model


def _classify_img_data(img_data):
    '''The main image classification method, used by `classify_image` (for camera images) and `classify_asset` (for photo library assets).'''
    vn_model = load_model()
    # Create and perform the recognition request:
    req = VNCoreMLRequest.alloc().initWithModel_(vn_model).autorelease()
    handler = VNImageRequestHandler.alloc().initWithData_options_(img_data, None).autorelease()
    success = handler.performRequests_error_([req], None)
    if success:
        best_result = req.results()[0]
        label = str(best_result.identifier())
        confidence = best_result.confidence()
        return {'label': label, 'confidence': confidence}
    else:
        return None


def classify_image(img):
    buffer = io.BytesIO()
    img.save(buffer, 'JPEG')
    img_data = ns(buffer.getvalue())
    return _classify_img_data(img_data)

def classify_asset(asset):
    img_data = ns(asset.get_image_data().getvalue())

    req = VNDetectTextRectanglesRequest.alloc().init()
    req.reportCharacterBoxes = True

    handler = VNImageRequestHandler.alloc().initWithData_options_(img_data, None).autorelease()
    success = handler.performRequests_error_([req], None)
    if success:
        im = ui.ImageView()
        pil_image = asset.get_image()
        print(pil_image.size)
        ui_image = asset.get_ui_image()
        wim,him = ui_image.size
        im.frame = (0,0,400,400*him/wim)
        wi = im.width
        hi = im.height
        im.image = ui_image
        im.present('sheet')
        for i in range(0,len(req.results())):
            observation = req.results()[i]  
            box = observation.boundingBox()
            xb=box.origin.x
            yb=box.origin.y
            wb=box.size.width
            hb=box.size.height
            #print('x=',xb)
            #print('y=',y )
            #print('width=',w )
            #print('height=',hb)
            l = ui.Label()
            l.frame = (xb*wi,yb*hi,wb*wi,hb*hi)
            #print(l.frame)
            #l.border_width = 1
            #l.border_color = 'red'
            im.add_subview(l)
            #print(dir(observation))
            confidence = observation.confidence()
            #print('confidence', confidence)
            for i_ch in range(0,len(observation.characterBoxes())):
              ch_box = observation.characterBoxes()[i_ch]
              box = ch_box.boundingBox()
              x=box.origin.x
              y=box.origin.y
              w=box.size.width
              h=box.size.height
              #print('x=',x)
              #print('y=',y)
              #print('width=',w)
              #print('height=',h)
              l = ui.Label()
              l.frame = (x*wi,yb*hi,w*wi,hb*hi)
              #print(l.frame)
              #l.border_width = 1
              #l.border_color = 'blue'
              im.add_subview(l)
              print((int(x*wim),int(yb*him),int(w*wim),int(hb*him)))
              pil_char = pil_image.crop((int(x*wim)-1,int(yb*him)-1,int((x+w)*wim)+1,int((yb+hb)*him)+1))
              pil_char.show()
              print(classify_image(pil_char))
              #print(dir(ch_box))
              #break
        print('ok')
    else:
        print('error')



def main():
    all_assets = photos.get_assets()
    asset = photos.pick_asset(assets=all_assets)
    if asset is None:
        return
    classify_asset(asset)

if __name__ == '__main__':
    main()

cvp

@pavlinb install this model OCR.mlmodel
and my script works almost perfectly (F instead of E)

pavlinb

You are very kind, thanks.

You gave me a lot of material to try.

Are you using it on an iPad?

cvp

@pavlinb yes sir, iPad mini 4

pavlinb

Do you know other mlmodels for OCR?

Regards.

cvp

@pavlinb You know, I didn't know anything before your post. I searched for "text detection mlmodel" on Google and found these...

I get bad results from my script with here and here

pavlinb

Ah, sorry then. You answered so fast that I thought you were a master of mlmodels.

Thanks anyway for the links.

cvp

@pavlinb I knew omz's script, and I found and converted the Swift code when I read your post.

pavlinb

Can you help me with one issue I can't fix on an iPhone 8?
link text

cvp

@pavlinb that's what we see

cvp

Please post the script

pavlinb

Sorry,
Click on the green "ИЗВЕГЛИ" button

cvp

@pavlinb I did, and after a few prompts I got the screenshot...

pavlinb

Here the blue boxes are shifted relative to the original text.

pavlinb
#!python3
'''
This is a demo of how you can use the CoreML framework (via objc_util) to classify images in Pythonista.
It downloads the trained 'MobileNet' CoreML model from the Internet, and uses it to classify images that
are either taken with the camera, or picked from the photo library.
'''

import requests
import os
import io
import photos
import dialogs
from PIL import Image
from objc_util import ObjCClass, nsurl, ns
import ui

# Configuration (change URL and filename if you want to use a different model):
MODEL_URL = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel'
#MODEL_FILENAME = 'Alphanum_28x28.mlmodel'
MODEL_FILENAME = 'OCR.mlmodel'
#MODEL_FILENAME = 'frozen_east_text_detection.pb.py'

# Use a local path for caching the model file (no need to sync this with iCloud):
MODEL_PATH = os.path.join(os.path.expanduser('~/Documents'), MODEL_FILENAME)

# Declare/import ObjC classes:
MLModel = ObjCClass('MLModel')
VNCoreMLModel = ObjCClass('VNCoreMLModel')
VNCoreMLRequest = ObjCClass('VNCoreMLRequest')

VNImageRequestHandler = ObjCClass('VNImageRequestHandler')
VNDetectTextRectanglesRequest = ObjCClass('VNDetectTextRectanglesRequest')


def load_model():
    '''Helper method for downloading/caching the mlmodel file'''
    if not os.path.exists(MODEL_PATH):
        print(f'Downloading model: {MODEL_FILENAME}...')
        r = requests.get(MODEL_URL, stream=True)
        file_size = int(r.headers['content-length'])
        with open(MODEL_PATH, 'wb') as f:
            bytes_written = 0
            for chunk in r.iter_content(1024*100):
                f.write(chunk)
                print(f'{bytes_written/file_size*100:.2f}% downloaded')
                bytes_written += len(chunk)
        print('Download finished')
    ml_model_url = nsurl(MODEL_PATH)
    # Compile the model:
    c_model_url = MLModel.compileModelAtURL_error_(ml_model_url, None)
    # Load model from the compiled model file:
    ml_model = MLModel.modelWithContentsOfURL_error_(c_model_url, None)
    # Create a VNCoreMLModel from the MLModel for use with the Vision framework:
    vn_model = VNCoreMLModel.modelForMLModel_error_(ml_model, None)
    return vn_model


def _classify_img_data(img_data):
    '''The main image classification method, used by `classify_image` (for camera images) and `classify_asset` (for photo library assets).'''
    vn_model = load_model()
    # Create and perform the recognition request:
    req = VNCoreMLRequest.alloc().initWithModel_(vn_model).autorelease()
    handler = VNImageRequestHandler.alloc().initWithData_options_(img_data, None).autorelease()
    success = handler.performRequests_error_([req], None)
    if success:
        best_result = req.results()[0]
        label = str(best_result.identifier())
        confidence = best_result.confidence()
        return {'label': label, 'confidence': confidence}
    else:
        return None


def classify_image(img):
    buffer = io.BytesIO()
    img.save(buffer, 'JPEG')
    img_data = ns(buffer.getvalue())
    return _classify_img_data(img_data)

def classify_asset(asset):
    img_data = ns(asset.get_image_data().getvalue())

    req = VNDetectTextRectanglesRequest.alloc().init()
    req.reportCharacterBoxes = True

    handler = VNImageRequestHandler.alloc().initWithData_options_(img_data, None).autorelease()
    success = handler.performRequests_error_([req], None)
    if success:
        im = ui.ImageView()
        pil_image = asset.get_image()
        print(pil_image.size)
        ui_image = asset.get_ui_image()
        wim,him = ui_image.size
        im.frame = (0,0,400,400*him/wim)
        #im.frame = (0,0,141,64)
        wi = im.width
        hi = im.height
        im.image = ui_image
        im.content_mode = 1 #1
        im.present()
        for i in range(0,len(req.results())):
            observation = req.results()[i]  
            box = observation.boundingBox()
            xb=box.origin.x
            yb=box.origin.y
            wb=box.size.width
            hb=box.size.height
            #print('x=',xb)
            #print('y=',y )
            #print('width=',w )
            #print('height=',hb)
            l = ui.Label()
            l.frame = (xb*wi,yb*hi,wb*wi,hb*hi)
            #print(l.frame)
            #l.border_width = 1
            #l.border_color = 'red'
            im.add_subview(l)
            #print(dir(observation))
            confidence = observation.confidence()
            #print('confidence', confidence)
            for i_ch in range(0,len(observation.characterBoxes())):
              ch_box = observation.characterBoxes()[i_ch]
              box = ch_box.boundingBox()
              x=box.origin.x
              y=box.origin.y
              w=box.size.width
              h=box.size.height
              #print('x=',x)
              #print('y=',y)
              #print('width=',w)
              #print('height=',h)
              l = ui.Label()
              l.frame = (x*wi,yb*hi,w*wi,hb*hi)
              #print(l.frame)
              l.border_width = 1
              l.border_color = 'blue'
              im.add_subview(l)
              print((int(x*wim),int(yb*him),int(w*wim),int(hb*him)))
              pil_char = pil_image.crop((int(x*wim)-1,int(yb*him)-1,int((x+w)*wim)+1,int((yb+hb)*him)+8))
              pil_char.show()
              print(classify_image(pil_char))
              #print(dir(ch_box))
              #break
        print('ok')
    else:
        print('error')



def main():
    all_assets = photos.get_assets()
    asset = photos.pick_asset(assets=all_assets)
    if asset is None:
        return
    classify_asset(asset)

if __name__ == '__main__':
    main()
cvp

@pavlinb I guess you changed my script a little...

pavlinb

Yes, a little.

cvp

@pavlinb the problem comes from the fact that you did not present as 'sheet'.

Then the width and height of the ImageView are not the ones we set at the beginning, easy isn't it? 😅

pavlinb

With 'sheet' even the original image is stretched to full screen. And according to the Pythonista documentation, sheet works on iPad only.

pavlinb

https://imgur.com/a/ogxpMPR

cvp

Sorry, as I work on an iPad, I always forget that 'sheet' does not exist on iPhone. This should be OK:

        mv = ui.View()
        im = ui.ImageView()
        pil_image = asset.get_image()
        print(pil_image.size)
        ui_image = asset.get_ui_image()
        wim,him = ui_image.size
        ws,hs = ui.get_screen_size()
        if (ws/hs) > (wim/him):
          h = ws*him/wim
          im.frame = (0,(hs-h)/2,ws,h)
        else:
          w = hs*wim/him
          im.frame = ((ws-w)/2,0,w,hs)
        mv.add_subview(im)
        wi = im.width
        hi = im.height
        im.image = ui_image
        im.content_mode = 1 #1
        mv.present()
cvp

@pavlinb Tried?

cvp

@pavlinb did you try to replace

im = ui....
.
.
.
im.present()

with the lines posted just before?

pavlinb

Hi, it looks like this:

cvp

@pavlinb for me, with 'full_screen'

pavlinb

Did you try on more natural images?

cvp

@pavlinb no, honestly, I don't know anything about mlmodels. When I read your topic, I only remembered an omz script and tried to add the Objective-C code for your VNDetectTextRectanglesRequest request. That's all, folks 😀

pavlinb

Here is why it doesn't work for me:

imgur

cvp

@pavlinb Imgur answer...

Oops! We couldn't find that page

Ok now

cvp

@pavlinb As I don't have this iPhone, could you print wim, him, ws, hs so I can simulate? Thanks

cvp

@pavlinb could you also post the analyzed image in Imgur?

pavlinb

@cvp wim = 141, him = 64 , ws = 375, hs = 667

imgur

cvp

@pavlinb didn't you rotate your iPhone during the test?
I simulated your dimensions and I get this.
I agree that the blue rectangles are not very good, but that's how it is with this code, I think

pavlinb

@cvp I'll play with dimensions.

pavlinb

@cvp In landscape the image looks better, but in portrait it is stretched and goes outside the screen.

cvp

@pavlinb It is strange because I simulated your iPhone in portrait mode

cvp

I think I made an error, try < instead of >, sorry 😕

        if (ws/hs) < (wim/him):
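For example, with the dimensions posted above (wim = 141, him = 64, ws = 375, hs = 667): ws/hs ≈ 0.56 is smaller than wim/him ≈ 2.20, so the first branch is taken and the image is fitted by width: h = 375 * 64 / 141 ≈ 170, giving im.frame = (0, (667 - 170) / 2, 375, 170), i.e. the image is centered vertically and letterboxed.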
pavlinb

@cvp Much better.

Only blue boxes are little shifted.

But for now it's not important.

pavlinb

It seems that VNRecognizeTextRequest exists in iOS 13.0+ beta, according to this link.

If I upgrade to the iOS 13 beta, should I be able to call it from Pythonista?

cvp

@pavlinb No idea, sorry. I think that yes, you could.

cvp

@pavlinb ask somebody who has installed the iOS 13 beta to test

from objc_util import *
VNRecognizeTextRequest = ObjCClass('VNRecognizeTextRequest').alloc()

If it runs without error, that should be ok for next development

If you get

ValueError: no Objective-C class named 'b'VNRecognizeTextRequest'' found

It is not good...
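For reference, here is a minimal sketch (untested, assuming iOS 13+) of how VNRecognizeTextRequest could be driven from Pythonista, following the same pattern as the VNDetectTextRectanglesRequest scripts above. Per Apple's Vision API, results() returns VNRecognizedTextObservation objects whose topCandidates_() method gives the recognized strings; the rest is glue code to be verified on-device.

from objc_util import ObjCClass, ns
import photos

VNImageRequestHandler = ObjCClass('VNImageRequestHandler')
VNRecognizeTextRequest = ObjCClass('VNRecognizeTextRequest')

def recognize_text(asset):
    img_data = ns(asset.get_image_data().getvalue())
    req = VNRecognizeTextRequest.alloc().init().autorelease()
    handler = VNImageRequestHandler.alloc().initWithData_options_(img_data, None).autorelease()
    if handler.performRequests_error_([req], None):
        for i in range(len(req.results())):
            observation = req.results()[i]            # VNRecognizedTextObservation
            best = observation.topCandidates_(1)[0]   # best VNRecognizedText candidate
            print(str(best.string()), best.confidence())
    else:
        print('error')

def main():
    asset = photos.pick_asset(assets=photos.get_assets())
    if asset is not None:
        recognize_text(asset)

if __name__ == '__main__':
    main()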

pavlinb

@cvp Ran without error on iOS 13 beta 7.

Let see if it works good.

cvp

@pavlinb 🤞

pavlinb

@cvp Works well, and no models are needed.

cvp

@pavlinb Super. I'm waiting for the official iOS 13...

mikael

@cvp, me too, and now much more eagerly. Thanks @pavlinb for the testing.

pavlinb

Is there a way to process every frame from the camera?

Not only photos from the gallery?

cvp

@pavlinb I think there is a topic about that in the forum, but I'm not sure

cvp

Not exactly what you want, but... see here

pavlinb

@cvp Checking now...

pavlinb

@cvp said:

@pavlinb I knew omz's script, and I found and converted the Swift code when I read your post.

Do you have some converter from Swift to Python, or did you do it manually?

cvp

@pavlinb Manually 😢, and I prefer converting Objective-C rather than Swift...

pavlinb

@cvp I found a Swift project that is good for detecting in real time, but it is hard for me to translate it to Python.

cvp

@pavlinb You can post the URL, but I don't promise anything if it's too long or too complex for me

pavlinb

@cvp https://developer.apple.com/documentation/vision/recognizing_objects_in_live_capture

There is download button at the top.

cvp

@pavlinb I think you (we?) could do it, starting from @JonB's script https://github.com/jsbain/objc_hacks/blob/master/live_camera_view.py

pavlinb

@cvp I thought the same :-)

cvp

@pavlinb First draft here

The back camera on my iPad mini 4 does not work, I don't know why

pavlinb

@cvp In class LiveCameraView(ui.View), the docstring says ''' device=1 == front, device=2 == back ''',

but the back camera is device=0

cvp

@pavlinb Thanks, gist corrected

pavlinb

@cvp A big step in the right direction. Thanks.

cvp

@pavlinb Next step is to integrate AVCaptureVideoDataOutput from example here

If you follow the logic of this code, you will come across the UIImageFromSampleBuffer function,
which is very complex (for me, at least)
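As a very rough, untested sketch of the core of such a UIImageFromSampleBuffer conversion with objc_util: the function and class names below are Apple's public CoreMedia/CoreImage/UIKit APIs, while the glue and the sample_buffer pointer (assumed to come from the AVCaptureVideoDataOutput delegate callback) are our assumptions.

from ctypes import c_void_p
from objc_util import ObjCClass, ObjCInstance, c

CMSampleBufferGetImageBuffer = c.CMSampleBufferGetImageBuffer
CMSampleBufferGetImageBuffer.restype = c_void_p
CMSampleBufferGetImageBuffer.argtypes = [c_void_p]

CIImage = ObjCClass('CIImage')
UIImage = ObjCClass('UIImage')

def ui_image_from_sample_buffer(sample_buffer):
    # CMSampleBuffer -> CVPixelBuffer -> CIImage -> UIImage
    pixel_buffer = CMSampleBufferGetImageBuffer(sample_buffer)
    ci_image = CIImage.imageWithCVPixelBuffer_(ObjCInstance(pixel_buffer))
    return UIImage.imageWithCIImage_(ci_image)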

pavlinb

@cvp My fifty cents: link

cvp

@pavlinb Super. Do you think I should try to intercept real camera frames, or are you happy with this way of taking still photos?

cvp

@pavlinb On my (old) iPad mini 4, the script takes 2 photos per second, far from real-time frame processing...

pavlinb

@cvp For my current needs it's perfect.

cvp

@pavlinb 👍

pavlinb

@cvp But it would be good if I could avoid the shutter sound. I assume it is caused by the method used to capture frames?

cvp

@pavlinb for info, AVCaptureStillImageOutput is deprecated, we should use AVCapturePhotoOutput

cvp

@pavlinb said:

But it would be good if I could avoid the shutter sound. I assume it is caused by the method used to capture frames?

Following this,
I tried this without success

      c.AudioServicesDisposeSystemSoundID.argtypes = [c_int32]  # set argtypes as an attribute, not as call keywords
      c.AudioServicesDisposeSystemSoundID(1108)  # 1108 = shutter sound
pavlinb

@cvp said:

@pavlinb Next step is to integrate AVCaptureVideoDataOutput from example here

If you follow the logic of this code, you will fall on UIImageFromSampleBuffer function
which is bery very complex (for me, at least)

Hi, did you manage to run the given code? It seems to contain a number of functions (ready to use or not).

Regards.

cvp

@pavlinb No, sorry.

sodoku

Which one of the code examples in this thread is the best one to copy and paste onto my iPad mini 2 for text detection, to get the starting numbers of a sudoku puzzle by taking a picture of the puzzle in a newspaper?

ccc

https://forum.omz-software.com/topic/3050/snapsudoku

cvp

@sodoku I'm sorry, but my iPad mini 4 is on iOS 13 with the latest Pythonista beta, and my little script does not work very well anymore...

cvp

@sodoku This is the shortest code I can give; it does not first search for the rectangles containing text, but assumes the image is a square grid with only one digit per cell.
You have to set (or ask for, up to you) the number of cells per row/column, here 9, and the percentage (here 15) of the cell dimensions taken up by the grid lines. Even if the image seems to be displayed entirely, it is not, because it is overlaid by buttons whose background_image is the cropped cell that is passed to the VNCoreMLRequest to try to recognize the digit.
And, as you can see in my example, it does not work correctly on my iPad mini 4 under iOS 13 and the latest Pythonista beta. You could try it on your device, but I can't help more, sorry. Hoping it will be better for you 😢


import os
import io
import photos
import dialogs
from PIL import Image
from objc_util import ObjCClass, ObjCInstance, nsurl, ns  # ObjCInstance is used below
import ui

MODEL_FILENAME = 'MNIST.mlmodel'
#MODEL_FILENAME = 'MNISTClassifier.mlmodel'
#MODEL_FILENAME = 'OCR.mlmodel'
#MODEL_FILENAME = 'Alphanum_28x28.mlmodel'

# Use a local path for caching the model file
MODEL_PATH = os.path.join(os.path.expanduser('~/Documents/'), MODEL_FILENAME)

# Declare/import ObjC classes:
MLModel = ObjCClass('MLModel')
VNCoreMLModel = ObjCClass('VNCoreMLModel')
VNCoreMLRequest = ObjCClass('VNCoreMLRequest')
VNImageRequestHandler = ObjCClass('VNImageRequestHandler')

def pil2ui(imgIn):
    with io.BytesIO() as bIO:
        imgIn.save(bIO, 'PNG')
        imgOut = ui.Image.from_data(bIO.getvalue())
    del bIO
    return imgOut

def load_model():
    global vn_model
    ml_model_url = nsurl(MODEL_PATH)
    # Compile the model:
    c_model_url = MLModel.compileModelAtURL_error_(ml_model_url, None)
    # Load model from the compiled model file:
    ml_model = MLModel.modelWithContentsOfURL_error_(c_model_url, None)
    # Create a VNCoreMLModel from the MLModel for use with the Vision framework:
    vn_model = VNCoreMLModel.modelForMLModel_error_(ml_model, None)
    return vn_model

def _classify_img_data(img_data):
    global vn_model
    # Create and perform the recognition request:
    req = VNCoreMLRequest.alloc().initWithModel_(vn_model).autorelease()
    handler = VNImageRequestHandler.alloc().initWithData_options_(img_data, None).autorelease()
    success = handler.performRequests_error_([req], None)
    if success:
        best_result = req.results()[0]
        label = str(best_result.identifier())
        confidence = best_result.confidence()
        return {'label': label, 'confidence': confidence}
    else:
        return None

def classify_image(img):
    buffer = io.BytesIO()
    img.save(buffer, 'JPEG')
    img_data = ns(buffer.getvalue())
    return _classify_img_data(img_data)

def classify_asset(asset):
  mv = ui.View()
  mv.background_color = 'white'
  im = ui.ImageView()
  pil_image = asset.get_image()
  print(pil_image.size)
  ui_image = asset.get_ui_image()
  n_squares = 9
  d_grid = 15 # % around the digit
  wim,him = pil_image.size
  ws,hs = ui.get_screen_size()
  if (ws/hs) < (wim/him):
    h = ws*him/wim
    im.frame = (0,(hs-h)/2,ws,h)
  else:
    w = hs*wim/him
    im.frame = ((ws-w)/2,0,w,hs)
  print(wim,him,ws,hs)
  mv.add_subview(im)
  wi = im.width
  hi = im.height
  im.image = ui_image
  im.content_mode = 1 #1
  mv.frame = (0,0,ws,hs)     
  mv.present('fullscreen')
  dx = wim/n_squares
  dy = him/n_squares
  d = dx*d_grid/100
  dl = int((wi/n_squares)*d_grid/100)
  for ix in range(n_squares):
    x = ix*dx
    for iy in range(n_squares):
      y = iy*dy
      pil_char = pil_image.crop((int(x+d),int(y+d),int(x+dx-d),int(y+dy-d)))
      l = ui.Button()
      l.frame = (int(ix*wi/n_squares)+dl, int(iy*hi/n_squares)+dl, int(wi/n_squares)-2*dl, int(hi/n_squares)-2*dl)
      l.border_width = 1
      l.border_color = 'red'
      l.tint_color = 'red'
      ObjCInstance(l).button().contentHorizontalAlignment= 1 # left
      l.background_image = pil2ui(pil_char)
      im.add_subview(l)
      l.title = classify_image(pil_char)['label'] 

def main():
    global vn_model
    vn_model = load_model()
    all_assets = photos.get_assets()
    asset = photos.pick_asset(assets=all_assets)
    if asset is None:
      return
    classify_asset(asset)

if __name__ == '__main__':
    main()

ccc

@cvp Cool looking output! I am trying to follow along at home but have a few questions:
1. What image (of a sudoku puzzle) did you start with?
2. How did you constrain the labels to just 0-9?
3. Could we have a GitHub repo for this effort? With goals:
   1. Recognize sudoku digits from a still image
   2. Recognize sudoku digits from a real-time image

cvp

@ccc
1. I used an image of a sudoku grid from a Google search, here
2. I used MNIST.mlmodel, which handles digits only
3. Not sure it would be interesting, because iOS 13 now offers a better way via VNRecognizeTextRequest; it was only to answer @sodoku's question

See the topic with @mikael's code

cvp

@ccc Do you have an iDevice on iOS < 13, just to know if digit recognition is working? My test above is not OK under iOS 13.

cvp

@ccc Really, only because you asked 😀: GitHub

ccc

For non-Pythonista platforms... https://github.com/neeru1207/AI_Sudoku

pavlinb

Vision OCR produces pretty good results.

But has anyone succeeded in OCRing text with subscripts or exponents?

(https://commons.wikimedia.org/wiki/File:Quadratic-formula.jpg)

pavlinb

What about OCR in realtime video mode? Is it possible?

twinsant

See this topic: https://forum.omz-software.com/topic/6016/recognize-text-from-picture

enginsur

Can you help me with one issue I can't fix on an iPhone 12 Pro?