// Copyright 2016 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vision

import (
	"image"

	"golang.org/x/text/language"
	pb "google.golang.org/genproto/googleapis/cloud/vision/v1"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
)

// Annotations contains all the annotations performed by the API on a single image.
// A nil field indicates either that the corresponding feature was not requested,
// or that annotation failed for that feature.
type Annotations struct {
	// Faces holds the results of face detection.
	Faces []*FaceAnnotation
	// Landmarks holds the results of landmark detection.
	Landmarks []*EntityAnnotation
	// Logos holds the results of logo detection.
	Logos []*EntityAnnotation
	// Labels holds the results of label detection.
	Labels []*EntityAnnotation
	// Texts holds the results of text detection.
	Texts []*EntityAnnotation
	// FullText holds the results of full text (OCR) detection.
	FullText *TextAnnotation
	// SafeSearch holds the results of safe-search detection.
	SafeSearch *SafeSearchAnnotation
	// ImageProps contains properties of the annotated image.
	ImageProps *ImageProps
	// Web contains web annotations for the image.
	Web *WebDetection
	// CropHints contains crop hints for the image.
	CropHints []*CropHint

	// If non-nil, then one or more of the attempted annotations failed.
	// Non-nil annotations are guaranteed to be correct, even if Error is
	// non-nil.
	Error error
}

func annotationsFromProto(res *pb.AnnotateImageResponse) *Annotations {
	as := &Annotations{}
	for _, a := range res.FaceAnnotations {
		as.Faces = append(as.Faces, faceAnnotationFromProto(a))
	}
	for _, a := range res.LandmarkAnnotations {
		as.Landmarks = append(as.Landmarks, entityAnnotationFromProto(a))
	}
	for _, a := range res.LogoAnnotations {
		as.Logos = append(as.Logos, entityAnnotationFromProto(a))
	}
	for _, a := range res.LabelAnnotations {
		as.Labels = append(as.Labels, entityAnnotationFromProto(a))
	}
	for _, a := range res.TextAnnotations {
		as.Texts = append(as.Texts, entityAnnotationFromProto(a))
	}
	as.FullText = textAnnotationFromProto(res.FullTextAnnotation)
	as.SafeSearch = safeSearchAnnotationFromProto(res.SafeSearchAnnotation)
	as.ImageProps = imagePropertiesFromProto(res.ImagePropertiesAnnotation)
	as.Web = webDetectionFromProto(res.WebDetection)
	as.CropHints = cropHintsFromProto(res.CropHintsAnnotation)
	if res.Error != nil {
		// res.Error is a google.rpc.Status. Convert to a Go error. Use a gRPC
		// error because it preserves the code as a separate field.
		// TODO(jba): preserve the details field.
		as.Error = grpc.Errorf(codes.Code(res.Error.Code), "%s", res.Error.Message)
	}
	return as
}
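// The sketch below shows how a caller might consume an Annotations value,
// including the partial-failure contract documented on the Error field.
// Here anns stands in for a result obtained from this package's client (the
// call producing it is not shown); only fields defined above are used.
//
//	if anns.Error != nil {
//		// One or more requested features failed; any non-nil fields
//		// are still valid.
//		log.Printf("some annotations failed: %v", anns.Error)
//	}
//	for _, l := range anns.Labels {
//		fmt.Printf("label %q: score %.2f\n", l.Description, l.Score)
//	}
//	if anns.FullText != nil {
//		fmt.Println("OCR text:", anns.FullText.Text)
//	}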
// A FaceAnnotation describes the results of face detection on an image.
type FaceAnnotation struct {
	// BoundingPoly is the bounding polygon around the face. The coordinates of
	// the bounding box are in the original image's scale, as returned in
	// ImageParams. The bounding box is computed to "frame" the face in
	// accordance with human expectations. It is based on the landmarker
	// results. Note that one or more x and/or y coordinates may not be
	// generated in the BoundingPoly (the polygon will be unbounded) if only a
	// partial face appears in the image to be annotated.
	BoundingPoly []image.Point

	// FDBoundingPoly is tighter than BoundingPoly, and
	// encloses only the skin part of the face. Typically, it is used to
	// eliminate the face from any image analysis that detects the "amount of
	// skin" visible in an image. It is not based on the landmarker results, only
	// on the initial face detection, hence the fd (face detection) prefix.
	FDBoundingPoly []image.Point

	// Face holds the detected face landmarks.
	Face FaceLandmarks

	// RollAngle indicates the amount of clockwise/anti-clockwise rotation of
	// the face relative to the image vertical, about the axis perpendicular to
	// the face. Range [-180,180].
	RollAngle float32

	// PanAngle is the yaw angle: the leftward/rightward angle that the face is
	// pointing, relative to the vertical plane perpendicular to the image. Range
	// [-180,180].
	PanAngle float32

	// TiltAngle is the pitch angle: the upwards/downwards angle that the face is
	// pointing relative to the image's horizontal plane. Range [-180,180].
	TiltAngle float32

	// DetectionConfidence is the detection confidence. The range is [0, 1].
	DetectionConfidence float32

	// LandmarkingConfidence is the face landmarking confidence. The range is [0, 1].
	LandmarkingConfidence float32

	// Likelihoods expresses the likelihood of various aspects of the face.
	Likelihoods *FaceLikelihoods
}

func faceAnnotationFromProto(pfa *pb.FaceAnnotation) *FaceAnnotation {
	fa := &FaceAnnotation{
		BoundingPoly:          boundingPolyFromProto(pfa.BoundingPoly),
		FDBoundingPoly:        boundingPolyFromProto(pfa.FdBoundingPoly),
		RollAngle:             pfa.RollAngle,
		PanAngle:              pfa.PanAngle,
		TiltAngle:             pfa.TiltAngle,
		DetectionConfidence:   pfa.DetectionConfidence,
		LandmarkingConfidence: pfa.LandmarkingConfidence,
		Likelihoods: &FaceLikelihoods{
			Joy:          Likelihood(pfa.JoyLikelihood),
			Sorrow:       Likelihood(pfa.SorrowLikelihood),
			Anger:        Likelihood(pfa.AngerLikelihood),
			Surprise:     Likelihood(pfa.SurpriseLikelihood),
			UnderExposed: Likelihood(pfa.UnderExposedLikelihood),
			Blurred:      Likelihood(pfa.BlurredLikelihood),
			Headwear:     Likelihood(pfa.HeadwearLikelihood),
		},
	}
	populateFaceLandmarks(pfa.Landmarks, &fa.Face)
	return fa
}
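// An illustrative sketch of inspecting face results. VeryLikely is assumed
// to be one of the Likelihood constants defined elsewhere in this package;
// everything else uses only FaceAnnotation fields defined above, with anns
// standing in for an Annotations value as before.
//
//	for _, f := range anns.Faces {
//		fmt.Printf("face at %v: roll %.1f, pan %.1f, tilt %.1f (confidence %.2f)\n",
//			f.BoundingPoly, f.RollAngle, f.PanAngle, f.TiltAngle, f.DetectionConfidence)
//		if f.Likelihoods.Joy >= VeryLikely {
//			fmt.Println("  likely smiling")
//		}
//	}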
// An EntityAnnotation describes the results of a landmark, label, logo or text
// detection on an image.
type EntityAnnotation struct {
	// ID is an opaque entity ID. Some IDs might be available in Knowledge Graph (KG).
	// For more details on KG please see:
	// https://developers.google.com/knowledge-graph/
	ID string

	// Locale is the language code for the locale in which the entity textual
	// description (next field) is expressed.
	Locale string

	// Description is the entity textual description, expressed in the language of Locale.
	Description string

	// Score is the overall score of the result. Range [0, 1].
	Score float32

	// Confidence is the accuracy of the entity detection in an image.
	// For example, for an image containing the Eiffel Tower, this field represents
	// the confidence that there is a tower in the query image. Range [0, 1].
	Confidence float32

	// Topicality is the relevancy of the ICA (Image Content Annotation) label to the
	// image. For example, the relevancy of 'tower' to an image containing
	// 'Eiffel Tower' is likely higher than to an image containing a distant towering
	// building, though the confidence that there is a tower may be the same.
	// Range [0, 1].
	Topicality float32

	// BoundingPoly is the image region to which this entity belongs. Not currently
	// filled for label detection. For text detection, BoundingPolys
	// are produced for the entire text detected in an image region, followed by
	// BoundingPolys for each word within the detected text.
	BoundingPoly []image.Point

	// Locations contains the location information for the detected entity.
	// Multiple LatLng structs can be present since one location may indicate the
	// location of the scene in the query image, and another the location of the
	// place where the query image was taken. Location information is usually
	// present for landmarks.
	Locations []LatLng

	// Properties are additional optional Property fields.
	// For example, a different kind of score or string that qualifies the entity.
	Properties []Property
}

func entityAnnotationFromProto(e *pb.EntityAnnotation) *EntityAnnotation {
	var locs []LatLng
	for _, li := range e.Locations {
		locs = append(locs, latLngFromProto(li.LatLng))
	}
	var props []Property
	for _, p := range e.Properties {
		props = append(props, propertyFromProto(p))
	}
	return &EntityAnnotation{
		ID:           e.Mid,
		Locale:       e.Locale,
		Description:  e.Description,
		Score:        e.Score,
		Confidence:   e.Confidence,
		Topicality:   e.Topicality,
		BoundingPoly: boundingPolyFromProto(e.BoundingPoly),
		Locations:    locs,
		Properties:   props,
	}
}
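// A sketch of reading entity results. Landmark annotations usually carry
// geographic coordinates in Locations; anns again stands in for an
// Annotations value, and Lat/Lng are assumed to be the coordinate fields of
// this package's LatLng type, which is defined elsewhere.
//
//	for _, lm := range anns.Landmarks {
//		fmt.Printf("landmark %q: score %.2f\n", lm.Description, lm.Score)
//		for _, loc := range lm.Locations {
//			fmt.Printf("  located at (%f, %f)\n", loc.Lat, loc.Lng)
//		}
//	}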
// TextAnnotation contains a structured representation of OCR extracted text.
// The hierarchy of an OCR extracted text structure looks like:
//
//	TextAnnotation -> Page -> Block -> Paragraph -> Word -> Symbol
//
// Each structural component, starting from Page, may further have its own
// properties. Properties describe detected languages, breaks, etc.
type TextAnnotation struct {
	// List of pages detected by OCR.
	Pages []*Page
	// UTF-8 text detected on the pages.
	Text string
}

func textAnnotationFromProto(pta *pb.TextAnnotation) *TextAnnotation {
	if pta == nil {
		return nil
	}
	var pages []*Page
	for _, p := range pta.Pages {
		pages = append(pages, pageFromProto(p))
	}
	return &TextAnnotation{
		Pages: pages,
		Text:  pta.Text,
	}
}

// A Page is a page of text detected from OCR.
type Page struct {
	// Additional information detected on the page.
	Properties *TextProperties
	// Page width in pixels.
	Width int32
	// Page height in pixels.
	Height int32
	// List of blocks of text, images etc. on this page.
	Blocks []*Block
}

func pageFromProto(p *pb.Page) *Page {
	if p == nil {
		return nil
	}
	var blocks []*Block
	for _, b := range p.Blocks {
		blocks = append(blocks, blockFromProto(b))
	}
	return &Page{
		Properties: textPropertiesFromProto(p.Property),
		Width:      p.Width,
		Height:     p.Height,
		Blocks:     blocks,
	}
}

// A Block is a logical element on the page.
type Block struct {
	// Additional information detected for the block.
	Properties *TextProperties
	// The bounding box for the block.
	// The vertices are in the order of top-left, top-right, bottom-right,
	// bottom-left. When a rotation of the bounding box is detected the rotation
	// is represented as around the top-left corner as defined when the text is
	// read in the 'natural' orientation.
	// For example:
	// * when the text is horizontal it might look like:
	//	0----1
	//	|    |
	//	3----2
	// * when it's rotated 180 degrees around the top-left corner it becomes:
	//	2----3
	//	|    |
	//	1----0
	// and the vertex order will still be (0, 1, 2, 3).
	BoundingBox []image.Point
	// List of paragraphs in this block (if this block is of type text).
	Paragraphs []*Paragraph
	// Detected block type (text, image etc.) for this block.
	BlockType BlockType
}

// A BlockType represents the kind of Block (text, image, etc.)
type BlockType int

const (
	// Unknown block type.
	UnknownBlock BlockType = BlockType(pb.Block_UNKNOWN)
	// Regular text block.
	TextBlock BlockType = BlockType(pb.Block_TEXT)
	// Table block.
	TableBlock BlockType = BlockType(pb.Block_TABLE)
	// Image block.
	PictureBlock BlockType = BlockType(pb.Block_PICTURE)
	// Horizontal/vertical line box.
	RulerBlock BlockType = BlockType(pb.Block_RULER)
	// Barcode block.
	BarcodeBlock BlockType = BlockType(pb.Block_BARCODE)
)

func blockFromProto(p *pb.Block) *Block {
	if p == nil {
		return nil
	}
	var paras []*Paragraph
	for _, pa := range p.Paragraphs {
		paras = append(paras, paragraphFromProto(pa))
	}
	return &Block{
		Properties:  textPropertiesFromProto(p.Property),
		BoundingBox: boundingPolyFromProto(p.BoundingBox),
		Paragraphs:  paras,
		BlockType:   BlockType(p.BlockType),
	}
}

// A Paragraph is a structural unit of text representing a number of words in
// a certain order.
type Paragraph struct {
	// Additional information detected for the paragraph.
	Properties *TextProperties
	// The bounding box for the paragraph.
	// The vertices are in the order of top-left, top-right, bottom-right,
	// bottom-left. When a rotation of the bounding box is detected the rotation
	// is represented as around the top-left corner as defined when the text is
	// read in the 'natural' orientation.
	// For example:
	// * when the text is horizontal it might look like:
	//	0----1
	//	|    |
	//	3----2
	// * when it's rotated 180 degrees around the top-left corner it becomes:
	//	2----3
	//	|    |
	//	1----0
	// and the vertex order will still be (0, 1, 2, 3).
	BoundingBox []image.Point
	// List of words in this paragraph.
	Words []*Word
}

func paragraphFromProto(p *pb.Paragraph) *Paragraph {
	if p == nil {
		return nil
	}
	var words []*Word
	for _, w := range p.Words {
		words = append(words, wordFromProto(w))
	}
	return &Paragraph{
		Properties:  textPropertiesFromProto(p.Property),
		BoundingBox: boundingPolyFromProto(p.BoundingBox),
		Words:       words,
	}
}

// A Word is a word in a text document.
type Word struct {
	// Additional information detected for the word.
	Properties *TextProperties
	// The bounding box for the word.
	// The vertices are in the order of top-left, top-right, bottom-right,
	// bottom-left. When a rotation of the bounding box is detected the rotation
	// is represented as around the top-left corner as defined when the text is
	// read in the 'natural' orientation.
	// For example:
	// * when the text is horizontal it might look like:
	//	0----1
	//	|    |
	//	3----2
	// * when it's rotated 180 degrees around the top-left corner it becomes:
	//	2----3
	//	|    |
	//	1----0
	// and the vertex order will still be (0, 1, 2, 3).
	BoundingBox []image.Point
	// List of symbols in the word.
	// The order of the symbols follows the natural reading order.
	Symbols []*Symbol
}

func wordFromProto(p *pb.Word) *Word {
	if p == nil {
		return nil
	}
	var syms []*Symbol
	for _, s := range p.Symbols {
		syms = append(syms, symbolFromProto(s))
	}
	return &Word{
		Properties:  textPropertiesFromProto(p.Property),
		BoundingBox: boundingPolyFromProto(p.BoundingBox),
		Symbols:     syms,
	}
}
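// A sketch of walking the OCR hierarchy, using BlockType to separate text
// blocks from tables, pictures, rulers and barcodes. Here ta stands in for
// a *TextAnnotation such as Annotations.FullText; only names defined in
// this file are used.
//
//	for _, page := range ta.Pages {
//		fmt.Printf("page %dx%d: %d blocks\n", page.Width, page.Height, len(page.Blocks))
//		for _, b := range page.Blocks {
//			if b.BlockType != TextBlock {
//				continue // skip non-text blocks
//			}
//			fmt.Printf("  text block with %d paragraphs\n", len(b.Paragraphs))
//		}
//	}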
// A Symbol is a symbol in a text document.
type Symbol struct {
	// Additional information detected for the symbol.
	Properties *TextProperties
	// The bounding box for the symbol.
	// The vertices are in the order of top-left, top-right, bottom-right,
	// bottom-left. When a rotation of the bounding box is detected the rotation
	// is represented as around the top-left corner as defined when the text is
	// read in the 'natural' orientation.
	// For example:
	// * when the text is horizontal it might look like:
	//	0----1
	//	|    |
	//	3----2
	// * when it's rotated 180 degrees around the top-left corner it becomes:
	//	2----3
	//	|    |
	//	1----0
	// and the vertex order will still be (0, 1, 2, 3).
	BoundingBox []image.Point
	// The actual UTF-8 representation of the symbol.
	Text string
}

func symbolFromProto(p *pb.Symbol) *Symbol {
	if p == nil {
		return nil
	}
	return &Symbol{
		Properties:  textPropertiesFromProto(p.Property),
		BoundingBox: boundingPolyFromProto(p.BoundingBox),
		Text:        p.Text,
	}
}

// TextProperties contains additional information about an OCR structural component.
type TextProperties struct {
	// A list of detected languages together with confidence.
	DetectedLanguages []*DetectedLanguage
	// Detected start or end of a text segment.
	DetectedBreak *DetectedBreak
}

// A DetectedLanguage is a language detected for a structural component.
type DetectedLanguage struct {
	// The BCP-47 language code, such as "en-US" or "sr-Latn".
	Code language.Tag
	// The confidence of the detected language, in the range [0, 1].
	Confidence float32
}

// DetectedBreak is the detected start or end of a structural component.
type DetectedBreak struct {
	// The type of break.
	Type DetectedBreakType
	// True if the break prepends the element.
	IsPrefix bool
}

// A DetectedBreakType is the type of a detected break.
type DetectedBreakType int

const (
	// Unknown break label type.
	UnknownBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_UNKNOWN)
	// Regular space.
	SpaceBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_SPACE)
	// Sure space (very wide).
	SureSpaceBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_SURE_SPACE)
	// Line-wrapping break.
	EOLSureSpaceBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_EOL_SURE_SPACE)
	// End-line hyphen that is not present in text; does not co-occur with
	// SPACE, LEADER_SPACE, or LINE_BREAK.
	HyphenBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_HYPHEN)
	// Line break that ends a paragraph.
	LineBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_LINE_BREAK)
)

func textPropertiesFromProto(p *pb.TextAnnotation_TextProperty) *TextProperties {
	if p == nil {
		return nil
	}
	var dls []*DetectedLanguage
	for _, dl := range p.DetectedLanguages {
		tag, _ := language.Parse(dl.LanguageCode)
		// Ignore the error. If err != nil the returned tag will not be garbage,
		// but a best-effort attempt at a parse. At worst it will be
		// language.Und, the documented "undefined" Tag.
		dls = append(dls, &DetectedLanguage{Code: tag, Confidence: dl.Confidence})
	}
	var db *DetectedBreak
	if p.DetectedBreak != nil {
		db = &DetectedBreak{
			Type:     DetectedBreakType(p.DetectedBreak.Type),
			IsPrefix: p.DetectedBreak.IsPrefix,
		}
	}
	return &TextProperties{
		DetectedLanguages: dls,
		DetectedBreak:     db,
	}
}
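// The sketch below reassembles a word's text from its symbols, honoring the
// DetectedBreak that OCR may attach to a symbol's properties. It uses only
// types defined above; w stands in for a *Word reached by walking
// TextAnnotation -> Page -> Block -> Paragraph, and the IsPrefix case is
// ignored for brevity.
//
//	var text string
//	for _, s := range w.Symbols {
//		text += s.Text
//		if s.Properties != nil && s.Properties.DetectedBreak != nil {
//			switch s.Properties.DetectedBreak.Type {
//			case SpaceBreak, SureSpaceBreak:
//				text += " "
//			case HyphenBreak:
//				text += "-\n"
//			case EOLSureSpaceBreak, LineBreak:
//				text += "\n"
//			}
//		}
//	}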
// SafeSearchAnnotation describes the results of a SafeSearch detection on an image.
type SafeSearchAnnotation struct {
	// Adult is the likelihood that the image contains adult content.
	Adult Likelihood
	// Spoof is the likelihood that an obvious modification was made to the
	// image's canonical version to make it appear funny or offensive.
	Spoof Likelihood
	// Medical is the likelihood that this is a medical image.
	Medical Likelihood
	// Violence is the likelihood that this image represents violence.
	Violence Likelihood
}

func safeSearchAnnotationFromProto(s *pb.SafeSearchAnnotation) *SafeSearchAnnotation {
	if s == nil {
		return nil
	}
	return &SafeSearchAnnotation{
		Adult:    Likelihood(s.Adult),
		Spoof:    Likelihood(s.Spoof),
		Medical:  Likelihood(s.Medical),
		Violence: Likelihood(s.Violence),
	}
}

// ImageProps describes properties of the image itself, like the dominant colors.
type ImageProps struct {
	// DominantColors describes the dominant colors of the image.
	DominantColors []*ColorInfo
}

func imagePropertiesFromProto(ip *pb.ImageProperties) *ImageProps {
	if ip == nil || ip.DominantColors == nil {
		return nil
	}
	var cinfos []*ColorInfo
	for _, ci := range ip.DominantColors.Colors {
		cinfos = append(cinfos, colorInfoFromProto(ci))
	}
	return &ImageProps{DominantColors: cinfos}
}

// WebDetection contains relevant information for the image from the Internet.
type WebDetection struct {
	// Deduced entities from similar images on the Internet.
	WebEntities []*WebEntity
	// Fully matching images from the Internet.
	// They're near-duplicates and most often a copy of the query image with
	// merely a size change.
	FullMatchingImages []*WebImage
	// Partial matching images from the Internet.
	// Those images are similar enough to share some key-point features. For
	// example an original image will likely have partial matching for its crops.
	PartialMatchingImages []*WebImage
	// Web pages containing the matching images from the Internet.
	PagesWithMatchingImages []*WebPage
}

func webDetectionFromProto(p *pb.WebDetection) *WebDetection {
	if p == nil {
		return nil
	}
	var (
		wes        []*WebEntity
		fmis, pmis []*WebImage
		wps        []*WebPage
	)
	for _, e := range p.WebEntities {
		wes = append(wes, webEntityFromProto(e))
	}
	for _, m := range p.FullMatchingImages {
		fmis = append(fmis, webImageFromProto(m))
	}
	for _, m := range p.PartialMatchingImages {
		pmis = append(pmis, webImageFromProto(m))
	}
	for _, g := range p.PagesWithMatchingImages {
		wps = append(wps, webPageFromProto(g))
	}
	return &WebDetection{
		WebEntities:             wes,
		FullMatchingImages:      fmis,
		PartialMatchingImages:   pmis,
		PagesWithMatchingImages: wps,
	}
}

// A WebEntity is an entity deduced from similar images on the Internet.
type WebEntity struct {
	// Opaque entity ID.
	ID string
	// Overall relevancy score for the entity.
	// Not normalized and not comparable across different image queries.
	Score float32
	// Canonical description of the entity, in English.
	Description string
}

func webEntityFromProto(p *pb.WebDetection_WebEntity) *WebEntity {
	return &WebEntity{
		ID:          p.EntityId,
		Score:       p.Score,
		Description: p.Description,
	}
}

// WebImage contains metadata for online images.
type WebImage struct {
	// The result image URL.
	URL string
	// Overall relevancy score for the image.
	// Not normalized and not comparable across different image queries.
	Score float32
}

func webImageFromProto(p *pb.WebDetection_WebImage) *WebImage {
	return &WebImage{
		URL:   p.Url,
		Score: p.Score,
	}
}

// A WebPage contains metadata for web pages.
type WebPage struct {
	// The result web page URL.
	URL string
	// Overall relevancy score for the web page.
	// Not normalized and not comparable across different image queries.
	Score float32
}

func webPageFromProto(p *pb.WebDetection_WebPage) *WebPage {
	return &WebPage{
		URL:   p.Url,
		Score: p.Score,
	}
}
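// A sketch of reading web detection results. Since the scores are not
// normalized and not comparable across queries, the example only reports
// them within a single result set; wd stands in for Annotations.Web.
//
//	for _, e := range wd.WebEntities {
//		fmt.Printf("web entity %q (id %s): score %.2f\n", e.Description, e.ID, e.Score)
//	}
//	for _, m := range wd.FullMatchingImages {
//		fmt.Println("full match:", m.URL)
//	}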
// CropHint is a single crop hint that is used to generate a new crop when
// serving an image.
type CropHint struct {
	// The bounding polygon for the crop region. The coordinates of the bounding
	// box are in the original image's scale, as returned in ImageParams.
	BoundingPoly []image.Point
	// Confidence of this being a salient region. Range [0, 1].
	Confidence float32
	// Fraction of importance of this salient region with respect to the original
	// image.
	ImportanceFraction float32
}

func cropHintsFromProto(p *pb.CropHintsAnnotation) []*CropHint {
	if p == nil {
		return nil
	}
	var chs []*CropHint
	for _, pch := range p.CropHints {
		chs = append(chs, cropHintFromProto(pch))
	}
	return chs
}

func cropHintFromProto(pch *pb.CropHint) *CropHint {
	return &CropHint{
		BoundingPoly:       boundingPolyFromProto(pch.BoundingPoly),
		Confidence:         pch.Confidence,
		ImportanceFraction: pch.ImportanceFraction,
	}
}
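// Since a crop hint's BoundingPoly is expressed in the original image's
// pixel coordinates, it can be reduced to an image.Rectangle for use with
// image types that support SubImage (such as *image.RGBA). The helper below
// is an illustrative sketch, not part of the package's API.

// rectFromPoly returns the tightest axis-aligned rectangle containing all
// vertices of poly, or the zero Rectangle if poly is empty.
func rectFromPoly(poly []image.Point) image.Rectangle {
	if len(poly) == 0 {
		return image.Rectangle{}
	}
	r := image.Rectangle{Min: poly[0], Max: poly[0]}
	for _, p := range poly[1:] {
		if p.X < r.Min.X {
			r.Min.X = p.X
		}
		if p.Y < r.Min.Y {
			r.Min.Y = p.Y
		}
		if p.X > r.Max.X {
			r.Max.X = p.X
		}
		if p.Y > r.Max.Y {
			r.Max.Y = p.Y
		}
	}
	return r
}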