## makefile for LA-Strings

# Locations of the bundled framepac sources (used for -I and -L) and of the
# bulk_ext tree that is only consulted when USE_BULK_EXT is defined.
INCDIR    := ./framepac
BE_INCDIR := ./bulk_ext
BE_APIDIR := ./bulk_ext/src/be13_api

# Extra shared-object target for the default build; stays empty unless the
# USE_BULK_EXT section below assigns it.
BULK_EXT_SO :=

# Where 'make install' copies the program (DESTDIR) and the databases (DBDIR).
DESTDIR := /usr/bin
DBDIR   := /usr/share/langident

# Object modules that are archived into $(LIBRARY).
OBJS := charset.o extract.o language.o score.o

# Everything copied into the distribution archive by the 'zip' target.
# The glob patterns are stored unexpanded; the shell expands them when the
# recipe runs.  The order is kept stable because it determines the order of
# the generated checksum listings.
DISTFILES := COPYING README CHANGELOG makefile manual.txt *.C *.h
DISTFILES += test/*.txt test/combine.sh test/Copyright test/README
DISTFILES += framepac/*.h framepac/*.C framepac/makefile framepac/makefile.??
DISTFILES += framepac/makefile.???
DISTFILES += langident/*.h langident/*.C langident/makefile langident/*.txt
DISTFILES += util/test-random.sh util/icuconv.C util/icutrans.C util/interleave.c
DISTFILES += util/dehtmlize.C util/score.C util/mktestset.sh util/eval.sh
DISTFILES += util/prepare_file.sh util/counts.sh util/add-random.sh util/make*.sh
DISTFILES += util/sort-err.sh util/makefile
DISTFILES += Crubadan/MANIFEST Crubadan/README
DISTFILES += Crubadan/High/* Crubadan/Med/* Crubadan/Low/* Crubadan/Rare/*
DISTFILES += models/makefile models/*.lid models/MANIFEST

# Static library holding $(OBJS).
LIBRARY := la-strings.a

#########################################################################
## define compilation options

# Defining USE_BULK_EXT adds the scan_strings.so target to the default
# build, sets THREADS=1 and MAKE_SHAREDLIB=1, and supplies the extra
# compile flags (stored in BULK_EXT) that the compile rule passes along.
ifdef USE_BULK_EXT
  MAKE_SHAREDLIB = 1
  THREADS = 1
  BULK_EXT_SO = scan_strings.so
  BULK_EXT = -fPIC -DBULK_EXTRACTOR -I$(BE_INCDIR) -I$(BE_INCDIR)/src -I$(BE_APIDIR)
endif

# Build-mode defaults.  Each of these can be overridden from the command
# line or the environment, e.g. "make BUILD_DBG=1 THREADS=1".
#
# BUILD_DBG selects the optimization/debug flag set applied further down;
# it is only given a default here when it has no (non-empty) value yet.
ifndef BUILD_DBG
### compile fully optimized for distribution
BUILD_DBG=0
### compile with debugging info
#BUILD_DBG=1
### compile with debugging info and all optimizations off
#BUILD_DBG=2
endif

# build statically-linked executable (1=yes, 0=no)
# (when 1, LINKTYPE below becomes "-static -z muldefs")
#STATIC?=1
STATIC?=0

# enable multi-threading? (1=yes, 0=no)
# (when 1, PTHREAD below becomes "-pthread -DFrMULTITHREAD"; the value is
#  also handed to the framepac and langident sub-makes)
#THREADS?=1
THREADS?=0

# NODEBUG is appended to CFLAGS; GDB is appended to CFLAGS in the optimized
# (BUILD_DBG=0) configuration.  Both conditionals below are currently
# no-ops because their only assignment is commented out: uncomment the
# assignment, or set the variable on the command line, to enable it.
ifndef NODEBUG
#NODEBUG=-DNDEBUG
endif

ifndef GDB
#GDB = -ggdb3
endif

# Profiling support: DO_PROFILE=1 forces -pg; otherwise a PROFILE value
# supplied by the caller is kept, and PROFILE is set to empty only when
# nothing was supplied.  PROFILE is used for both compiling and linking.
ifeq ($(DO_PROFILE),1)
  PROFILE = -pg
  # alternative setting: PROFILE=-DPURIFY
else ifndef PROFILE
  PROFILE =
endif

# Optional sanitizer instrumentation, selected with SANE=<n>.  SANITIZE is
# added to both the compile and link flags; LINKSAN is added to the link
# flags only.
#   SANE=1  -fsanitize=thread   (position-independent executable, -DHELGRIND)
#   SANE=2  -fsanitize=address  (-DPURIFY)
#   SANE=3  -fsanitize=leak     (-DPURIFY)
#   SANE=4  -fsanitize=memory
#   SANE=5  -fsanitize=undefined
# Any other value (or none) leaves sanitizing off.
#
# The frame-pointer option is spelled -fno-omit-frame-pointer; the former
# spelling "-fno-omit-framepointer" is rejected by the compiler as an
# unrecognized option, which broke the SANE=2 and SANE=4 builds.
ifeq ($(SANE),1)
SANITIZE=-fsanitize=thread -fPIE -DHELGRIND
LINKSAN=-pie
else ifeq ($(SANE),2)
SANITIZE=-fsanitize=address -fno-omit-frame-pointer -DPURIFY
else ifeq ($(SANE),3)
SANITIZE=-fsanitize=leak -DPURIFY
else ifeq ($(SANE),4)
SANITIZE=-fsanitize=memory -fno-omit-frame-pointer
else ifeq ($(SANE),5)
SANITIZE=-fsanitize=undefined
endif

# Target processor selection.  CPU picks the -march/-mtune settings and
# feature defines assembled into CPUDEF further down; BITS is passed to the
# compiler and linker as -m$(BITS).  Both are only defaulted here when they
# have no value yet, so they can be set on the command line.
ifndef CPU
## Uncomment the appropriate CPU type
### 486
#CPU=4
### Pentium
#CPU=5
### PentiumPro or higher
#CPU=6
### AMD Athlon; not supported by GCC 2.x
#CPU=7
### AMD64/x86_64 CPUs in 64-bit mode; not supported by GCC 2.x
###    (AMD K8 [Opteron/Athlon64], newest PentiumIV with EM64t)
#CPU=8
### AMD64 "PhenomII" (K10) or newer
#CPU=10
### Let GCC auto-determine CPU type, but assume at least CPU=8 capabilities
CPU=99
endif

# Word size of the generated code (32 or 64).
ifndef BITS
#BITS=32
BITS=64
endif

# Thread support: only THREADS=1 turns on the pthread flags, which are used
# for both compiling and linking.
ifneq ($(THREADS),1)
  PTHREAD =
else
  # optional extra, currently not used: -fopenmp
  PTHREAD = -pthread -DFrMULTITHREAD
endif

# Link type: only STATIC=1 requests a static executable.
ifneq ($(STATIC),1)
  LINKTYPE =
else
  LINKTYPE = -static -z muldefs
endif

# Compile position-independent code when a shared library is being built.
ifdef MAKE_SHAREDLIB
  SHAREDLIB = -fPIC -DSHARED
endif

# NOICONV=1 compiles with -DNO_ICONV.
ifneq ($(NOICONV),1)
  ICONV =
else
  ICONV = -DNO_ICONV
endif

# Names used by the 'zip' target: the staging directory (RELPATH) and the
# archive file (ZIPNAME).  A RELEASE value, when given, is appended to both.
ifdef RELEASE
  RELPATH = LAStrings-$(RELEASE)
  ZIPNAME = lastrings-$(RELEASE).zip
else
  RELPATH = LAStrings
  ZIPNAME = lastrings.zip
endif

# Compiler warning options (these form the start of CFLAGS).
WARN  = -Wall -Wextra -Wno-deprecated
WARN += -Wshadow -Wcast-align
WARN += -Wmissing-noreturn -Wmissing-format-attribute
# Further candidates, currently disabled:
#WARN += -Wunused-result (not on Doha)
#WARN += -Wno-multichar -Wpacked -Wdisabled-optimization -Wpadded

# explicitly force includes to check here first, to fix an incompatibility
#   with the templated iostreams (they don't have many of the functions
#   present in the old iostream)
#EXTRAINC=-I/usr/include/g++-3/

# Translate the CPU/BITS selection into compiler options.
#   LINKBITS  -m32 / -m64, used for both compiling and linking
#   CPUDEF    -march/-mtune options plus preprocessor defines for the
#             selected CPU level
# Values of CPU not listed explicitly (e.g. 4 and 5) fall through to the
# generic i$(CPU)86 branch.  For every CPU value except 99 a
# -D__$(CPU)86__ define is appended after the chain.
LINKBITS=-m$(BITS)
ifeq ($(CPU),99)
  # auto-detection, assuming at least AMD "K8" level of features (any
  #  x64 processor qualifies); required GCC 4.2+
  CPUDEF=-march=native -D__886__ -D__BITS__=$(BITS)
else ifeq ($(CPU),10)
  # newest AMD chips: "Barcelona", PhenomII
  CPUDEF=-march=amdfam10 -D__886__ -D__BITS__=$(BITS)
else ifeq ($(CPU),8)
  CPUDEF=-march=k8 -msse -D__BITS__=$(BITS)
else ifeq ($(CPU),7)
  CPUDEF=-march=athlon-xp -mmmx
else ifeq ($(CPU),6)
  CPUDEF=-march=i$(CPU)86 -mtune=i$(CPU)86 -mmmx
else
  CPUDEF=-march=i$(CPU)86 -mtune=i$(CPU)86
endif
ifneq ($(CPU),99)
CPUDEF += -D__$(CPU)86__
endif

# Compiler and linker driver (the same command is used for both).
CC     = g++ -std=c++11
CCLINK = $(CC)

# Base compile flags.  All pieces are recursively expanded, so components
# that are empty or never defined (EXTRAINC, INCLUDEDIRS, COMPILE_OPTS, ...)
# simply contribute nothing.  The BUILD_DBG section appends the
# optimization and debug options afterwards.
CFLAGS = $(WARN) $(CPUDEF) $(PTHREAD) $(PROFILE) $(ICONV) $(NODEBUG) \
         $(LINKBITS) -pipe $(EXTRAINC) $(SANITIZE) $(INCLUDEDIRS) \
         $(SHAREDLIB) $(COMPILE_OPTS)

# Output options for linking an executable.  This value is superseded by
# the CFLAGEXE assignment that follows the BUILD_DBG section.
CFLAGEXE = -L$(LIBINSTDIR) $(PROFILE) -o $@

# Flags handed to the link step.
LINKFLAGS = $(LINKBITS) $(LINKTYPE) $(PTHREAD) $(SANITIZE) $(LINKSAN) \
            -L$(INCDIR)

# Optimization / debug options per BUILD_DBG value.
#   2      debugging info, all optimizations off
#   1      debugging info (effectively -O0 as well, because of the second
#          stage below)
#   -1     full optimization plus -ggdb3 debugging info
#   other  full optimization (the default, BUILD_DBG=0)
#
# Stage 1: basic optimization level and debug-info options.
ifeq ($(BUILD_DBG),2)
  CFLAGS += -ggdb3 -O0 -fno-inline -g3
else ifeq ($(BUILD_DBG),1)
  CFLAGS += -ggdb3 -O -g3
else
  CFLAGS += -O3 -fexpensive-optimizations -g$(DBGLVL) $(GDB)
# CFLAGS += -fweb -ftracer -fgcse-sm -fgcse-las -fno-math-errno
endif

# Stage 2: remaining options, CFLAGSLOOP, and debug info at link time.
# BUILD_DBG=2 has its own branch so that it does NOT fall into the final
# 'else': that branch appends -O3, and since the last -O option on the
# command line wins, it used to cancel the -O0 requested above.
ifeq ($(BUILD_DBG),2)
CFLAGSLOOP=$(CFLAGS)
LINKFLAGS += -ggdb3
else ifeq ($(BUILD_DBG),1)
CFLAGS +=-O0 -fno-inline -ggdb3
CFLAGSLOOP=$(CFLAGS)
LINKFLAGS += -ggdb3
else ifeq ($(BUILD_DBG),-1)
CFLAGS +=-O3 -fexpensive-optimizations -ggdb3
CFLAGSLOOP=-fno-align-labels -fno-align-loops $(CFLAGS)
LINKFLAGS += -ggdb3
else
CFLAGS +=-O3 -fexpensive-optimizations -fmerge-constants
CFLAGSLOOP=-fno-align-labels -fno-align-loops $(CFLAGS)
endif

# Output options used when linking an executable ($@ is expanded inside
# the recipe that uses this variable).
CFLAGEXE=-o $@ -m$(BITS) $(PROFILE)


#########################################################################
## define the programs needed to create the target library

# Archiver used to build $(LIBRARY).
LIBRARIAN=ar
# r = insert members, c = create the archive without a warning.
# The former value "rucl" also carried:
#   u - pointless here, because the library rule deletes the archive before
#       rebuilding it (and it draws a warning from deterministic-mode ar);
#   l - historically accepted and ignored, but recent GNU ar treats it as
#       "library dependencies follow" and consumes the next argument, which
#       would be the archive name.
LIBFLAGS=rc
# Archive index generator and its argument.
LIBINDEXER = ranlib
LIBIDXFLAGS = $(LIBRARY)

# File utilities used by the recipes below.
RM=rm -f
CP?=/bin/cp

#########################################################################
## standard targets

# Default goal: the program, the optional bulk_ext plugin (empty unless
# USE_BULK_EXT is defined), the mklangid tool and the two default databases.
.PHONY: default
default: la-strings \
         $(BULK_EXT_SO) \
         langident/mklangid \
         languages.db \
         charsets.db
	@echo "top100.db and crubadan.db are not built by default -- use 'make all'"

# Print a short summary of the user-facing targets.  The text is purely
# informational; nothing is built.
.PHONY: help
help:
	@echo "Standard build targets:"
	@echo "  default	program files plus default language database"
	@echo "  all		program files plus all language databases"
	@echo "  install	copy program to DESTDIR and databases to DBDIR"
	@echo "  zip		distribution archive"
	@echo "  tags		run etags over source"
	@echo "  clean		clean up build files in top-level directory"
	@echo "  allclean	clean up build files in all directories"
	@echo ""
	@echo "Language database build targets:"
	@echo "  languages.db	  default database, built by default"
	@echo "  top100		  100 languages (also built by 'make all')"
	@echo "  noutf16	  all/top100 languages, but omit UTF16BE and UTF16LE"

# Command-style targets that do not correspond to files.
.PHONY: all clean allclean tags zip lib install \
        top100 noutf16 top100-noutf16

# Everything in 'default' plus the Crubadan and top-100 databases.
all: default \
     crubadan.db \
     top100.db top100-charsets.db

# 'default' plus the top-100 language and charset databases.
top100: default \
        top100.db top100-charsets.db

# 'default' plus all databases built without the UTF-16 models.
noutf16: default \
         top100-noutf16 \
         lang-noutf16.db noutf16-charsets.db

# Top-100 databases built without the UTF-16 models.
top100-noutf16: top100-noutf16.db \
                top100-noutf16-charsets.db

# Remove the build products of the top-level directory: objects, the
# bulk_ext plugin, the program, and the static library (the library was
# previously left behind).  The generated *.db files are not removed.
# Errors are ignored so that cleaning always runs to completion.
clean:
	-$(RM) *.o scan_*.so la-strings $(LIBRARY)

# Additionally clean the framepac and langident subdirectories.  $(MAKE) -C
# refuses to run when the directory is missing, instead of running 'clean'
# a second time in the current directory as "cd dir ; make" would.
allclean: clean
	-$(MAKE) -C framepac clean
	-$(MAKE) -C langident clean

# Regenerate the Emacs tags table for the top-level headers and sources,
# treating all of them as C++.
tags:
	etags --c++ *.h *.C

# Install the program into $(DESTDIR) and the databases into $(DBDIR).
# Both directories are created first: without that, copying the program to
# a DESTDIR that does not exist yet would create a plain file with that
# name instead of installing into a directory.
# crubadan.db is optional (only built by 'make all'), so failure to copy it
# is ignored.
install: la-strings languages.db top100.db
	mkdir -p $(DBDIR) $(DESTDIR)
	$(CP) -p languages.db $(DBDIR)
	$(CP) -p top100.db $(DBDIR)
	-$(CP) -p crubadan.db $(DBDIR)
	$(CP) -p la-strings $(DESTDIR)

# Build the distribution archive $(ZIPNAME):
#   1. strip the executables (errors ignored),
#   2. delete any previous archive,
#   3. create the staging directory $(RELPATH) -- this step is not
#      error-tolerant, so the recipe stops if the directory already exists,
#   4. copy $(DISTFILES) into it, keeping their relative paths (--parents),
#   5. write MD5SUM and SHA1SUM listings inside the staging directory,
#   6. pack the staging directory into the archive.
# Steps 4 and 5 ignore errors, so files missing from the source tree do not
# abort the packaging.
# NOTE(review): "zip -m" presumably removes the staging directory after
# archiving it -- confirm against the zip manual.
zip:	la-strings langident/mklangid langident/whatlang
	-strip la-strings langident/mklangid langident/whatlang
	-$(RM) $(ZIPNAME)
	mkdir $(RELPATH)
	-cp -ip --parents $(DISTFILES) $(RELPATH)
	-( cd $(RELPATH) ; md5sum ${DISTFILES} >MD5SUM )
	-( cd $(RELPATH) ; sha1sum ${DISTFILES} >SHA1SUM )
	zip -mro9q $(ZIPNAME) $(RELPATH)/

# Convenience alias: build only the static library.
lib:	$(LIBRARY)

#########################################################################
## executables

# Link the main program.  The prerequisites are passed to the linker in the
# order listed here (main object first, then the three static libraries).
la-strings: la-strings.o $(LIBRARY) langident/langident.a framepac/framepac.a
	$(CCLINK) $(LINKFLAGS) $(CFLAGEXE) $^

# Build the langident tools through the sub-make in langident/, handing
# down the shared-library and threading settings.
# $(MAKE) -C is used rather than "cd langident ; $(MAKE) ... all": with the
# latter, a failing cd let 'all' run in this directory, which leads straight
# back into this rule.
# The rule has no prerequisites, so it only runs while langident/mklangid
# does not exist.
langident/mklangid:
	$(MAKE) -C langident MAKE_SHAREDLIB=$(MAKE_SHAREDLIB) THREADS=$(THREADS) all

# 'whatlang' has no recipe; its only prerequisite is the name 'langident',
# for which this makefile defines no rule.
# NOTE(review): this presumably predates the langident/mklangid rule (whose
# sub-make 'all' is expected to produce langident/whatlang) -- confirm
# whether this target is still needed.
whatlang: langident

#########################################################################
## data files

# Default language database, built by mklangid from every .lid model in
# models/.  A failing mklangid run is tolerated (leading '-').
languages.db: models/MANIFEST langident/mklangid
	@echo "*** NOTE: Building the language database requires about 1 GB of RAM for MkLangID ***"
	-$(RM) $@
	-langident/mklangid =$@ -v -f ./models/*.lid

# Charset database generated by mklangid from languages.db.
charsets.db: languages.db
	@echo "*** NOTE: Building the charset database requires about 1.5 GB of RAM ***"
	-$(RM) $@
	-langident/mklangid ==$< -v -C 0.0,$@

# Language database restricted to the models linked into models/top100/.
top100.db: models/top100/MANIFEST \
           langident/mklangid
	-$(RM) $@
	-langident/mklangid =$@ -v -f ./models/top100/*.lid

# Charset database generated by mklangid from top100.db.
top100-charsets.db: top100.db
	-$(RM) $@
	-langident/mklangid ==$< -v -C 0.0,$@

# Language database built from the models linked into models/noutf16/.
lang-noutf16.db: models/noutf16/MANIFEST \
                 langident/mklangid
	@echo "*** NOTE: Building the language database requires about 1 GB of RAM for MkLangID ***"
	-$(RM) $@
	-langident/mklangid =$@ -v -f ./models/noutf16/*.lid

# Charset database generated by mklangid from lang-noutf16.db.
noutf16-charsets.db: lang-noutf16.db
	-$(RM) $@
	-langident/mklangid ==$< -v -C 0.0,$@

# Language database built from the models linked into models/top100noutf16/.
top100-noutf16.db: models/top100noutf16/MANIFEST \
                   langident/mklangid
	@echo "*** NOTE: Building the language database requires about 1 GB of RAM for MkLangID ***"
	-$(RM) $@
	-langident/mklangid =$@ -v -f ./models/top100noutf16/*.lid

# Charset database generated by mklangid from top100-noutf16.db.
top100-noutf16-charsets.db: top100-noutf16.db
	-$(RM) $@
	-langident/mklangid ==$< -v -C 0.0,$@

# Extended language database: all models in models/ plus the Crubadan
# trigram files, with a separate -d value for each Crubadan tier
# (High, Med, Low, Rare).
# NOTE(review): the recipe also reads ./models/*.lid, but models/MANIFEST is
# not listed as a prerequisite -- confirm whether that is intended.
crubadan.db: Crubadan/MANIFEST \
             langident/mklangid
	@echo "*** NOTE: Building the language database requires about 1 GB of RAM for MkLangID ***"
	-$(RM) $@
	-langident/mklangid =$@ -v -f ./models/*.lid \
		-fc -d 1.15 -fc ./Crubadan/High/*.3gm \
		-d 1.25 -fc ./Crubadan/Med/*.3gm \
		-d 1.45 -fc ./Crubadan/Low/*.3gm \
		-d 1.75 -fc ./Crubadan/Rare/*.3gm

# Charset database generated by mklangid from crubadan.db.
crubadan-charsets.db: crubadan.db
	-$(RM) $@
	-langident/mklangid ==$< -v -C 0.0,$@

# Delegate to the makefile in models/ to create MANIFEST (and whatever
# derived files that target produces from the distributed language models).
models/MANIFEST:
	$(MAKE) -C models MANIFEST

# Extract just the non-UTF16 models: populate models/noutf16/ with symlinks
# to every model in models/ whose name contains neither "utf16" nor
# "ASCII-16", then list the links in MANIFEST.
# Each step that works inside the directory is chained with '&&' so that a
# failed cd can no longer create links or a MANIFEST in the wrong directory.
# 'grep -F' replaces the obsolescent 'fgrep'.
models/noutf16/MANIFEST: models/MANIFEST
	mkdir -p models/noutf16
	-$(RM) models/noutf16/*.lid
	cd models/noutf16 && ln -s `ls -1 ../*.lid | grep -F -v -e utf16 -e ASCII-16` .
	cd models/noutf16 && ls *.lid >MANIFEST

# Language-code prefixes of the models that make up the "top 100" set
# (plus the NUM pseudo-language).
TOP100_LANGS = gan hak cmn nan wuu hsn yue es en ar hi bn pt ru ja de jv pn \
	te vi mr fr kr ta it ur tr gu pl mly bh bho awa uk ml kn mai su my ori \
	fa prs pes rwr pa fil ha tl ro id nl snd th pst uz raj hoj mup yo az \
	azb azj ig am hne gaz gax hae asm hr sr ku ckb kmr sdh ceb si rkt tts \
	zha mg ne so khm mad bar el ctg bgc mag dcc hu ful fub fuc ca sna zu \
	syl quf quh que bjj cs lmo bg ug nya bel kk kaz se aka xh bfy ht kok \
	kin kik nap bal bcc ilo vah pnb tk da he sl sk fi no lt NUM

# Extract just the top-100 models: populate models/top100/ with symlinks to
# ../<lang>.*.lid for every code in TOP100_LANGS, then list them in MANIFEST.
# The per-language glob patterns are generated by make itself rather than by
# shell brace expansion ({a,b,...}), which is a bash feature and is not
# available when /bin/sh (make's default shell) is a plain POSIX shell.
# A pattern that matches no file is passed to ln literally and produces a
# dangling link named "<lang>.*.lid"; the following rm removes those.
models/top100/MANIFEST: models/MANIFEST
	mkdir -p models/top100
	-$(RM) models/top100/*.lid
	cd models/top100 && ln -s $(foreach lang,$(TOP100_LANGS),../$(lang).*.lid) .
	cd models/top100 && rm -f *.\*.lid
	cd models/top100 && ls *.lid >MANIFEST

# Extract just the non-UTF16 top-100 models: populate models/top100noutf16/
# with symlinks to every model in models/top100/ whose name contains neither
# "utf16" nor "ASCII-16", then list the links in MANIFEST.
# Each step that works inside the directory is chained with '&&' so that a
# failed cd can no longer create links or a MANIFEST in the wrong directory.
# 'grep -F' replaces the obsolescent 'fgrep'.
models/top100noutf16/MANIFEST: models/top100/MANIFEST
	mkdir -p models/top100noutf16
	-$(RM) models/top100noutf16/*.lid
	cd models/top100noutf16 && ln -s `ls -1 ../top100/*.lid | grep -F -v -e utf16 -e ASCII-16` .
	cd models/top100noutf16 && ls *.lid >MANIFEST

#########################################################################
## object modules

# Header dependencies of the object modules.  None of these rules has a
# recipe; the objects are compiled by the default compilation rule at the
# end of this file.
la-strings.o: la-strings.C \
              charset.h extract.h la-strings.h \
              langident/langid.h

charset.o: charset.C \
           charset.h language.h \
           langident/roman.h

extract.o: extract.C \
           extract.h charset.h score.h \
           langident/trie.h langident/langid.h

language.o: language.C \
            language.h charset.h

scan_strings.o: scan_strings.C \
                charset.h extract.h la-strings.h \
                langident/langid.h

# Shared object for the bulk_ext build (the target named by BULK_EXT_SO when
# USE_BULK_EXT is defined).  All prerequisites are linked, in the order
# listed, with the soname set to the file name.
scan_strings.so: scan_strings.o \
                 $(LIBRARY) \
                 langident/langident.a \
                 framepac/framepac.a \
                 bulk_ext/src/be13_api/beregex.o
	$(CC) -shared -Wl,-soname,$@ -o $@ $^

# Header dependencies of score.o (compiled by the default rule).
score.o: score.C \
         score.h charset.h

#########################################################################
## header files -- touching to ensure proper recompilation

# Each rule below bumps a header's timestamp whenever one of the headers
# listed as its prerequisite is newer, so that objects depending on the
# touched header are recompiled as well.
charset.h:	language.h langident/trie.h
	touch $@

extract.h:	language.h
	touch $@

# NOTE(review): neither wordhash.h nor config.h is referenced by any other
# rule in this makefile -- confirm whether this rule is still needed.
wordhash.h: config.h
	touch $@

#########################################################################
## libraries


# Rebuild the static library from scratch out of the object modules, then
# index it.  Failure to delete a previous archive is ignored.
$(LIBRARY): $(OBJS)
	-$(RM) $@
	$(LIBRARIAN) $(LIBFLAGS) $@ $^
	$(LIBINDEXER) $(LIBIDXFLAGS)

# The bundled libraries are always handed to their own makefiles, which
# decide whether anything needs rebuilding.  'nonexistentfile' is the force
# target that makes these rules run on every invocation; it is declared
# phony so that a file of that name appearing in the directory cannot turn
# the forcing off.
# $(MAKE) -C is used rather than "cd dir ; $(MAKE) ... lib": with the
# latter, a failing cd let 'lib' run in this directory instead.
.PHONY: nonexistentfile

framepac/framepac.a: nonexistentfile
	$(MAKE) -C framepac MAKE_SHAREDLIB=$(MAKE_SHAREDLIB) THREADS=$(THREADS) lib

langident/langident.a:  nonexistentfile
	$(MAKE) -C langident MAKE_SHAREDLIB=$(MAKE_SHAREDLIB) THREADS=$(THREADS) lib

nonexistentfile:

#########################################################################
## default compilation rule

# Compile a .C source file into the object file of the same name.
# BULK_EXT is only non-empty when USE_BULK_EXT is defined; CPUTYPE is not
# set anywhere in this makefile and expands to nothing unless the caller
# provides it.
%.o: %.C
	$(CC) $(CFLAGS) $(CPUTYPE) -I$(INCDIR) $(BULK_EXT) -c $<
